From 78e355fd08bd127541274c158bac1bc1189beba9 Mon Sep 17 00:00:00 2001
From: gaoqiong
Date: Tue, 20 Dec 2022 16:42:06 +0800
Subject: [PATCH] onnxruntime

---
 CITATION.cff | 67 + CMakeLists.txt | 311 +++ CONTRIBUTORS.md | 31 + Config.cmake.in | 11 + Dockerfile | 110 + Jenkinsfile | 705 ++++++ LICENSE | 28 + README.md | 96 +- client_example/01_gemm/CMakeLists.txt | 2 + client_example/01_gemm/gemm.cpp | 218 ++ .../02_gemm_add_add_fastgelu/CMakeLists.txt | 13 + .../gemm_add_add_fastgelu.cpp | 241 ++ .../gemm_add_fastgelu.cpp | 233 ++ .../gemm_fastgelu.cpp | 225 ++ .../03_gemm_layernorm/CMakeLists.txt | 2 + .../gemm_add_add_layernorm.cpp | 274 +++ client_example/04_contraction/CMakeLists.txt | 6 + .../04_contraction/contraction_bilinear.cpp | 236 ++ .../04_contraction/contraction_scale.cpp | 222 ++ client_example/05_layernorm/CMakeLists.txt | 2 + client_example/05_layernorm/layernorm2d.cpp | 163 ++ client_example/06_softmax/CMakeLists.txt | 2 + client_example/06_softmax/softmax4d.cpp | 150 ++ .../07_grouped_conv2d_fwd/CMakeLists.txt | 2 + .../grouped_conv2d_fwd.cpp | 226 ++ .../08_fused_attention/CMakeLists.txt | 2 + .../08_fused_attention/fused_attention.cpp | 213 ++ client_example/09_quantization/CMakeLists.txt | 11 + ..._fwd_bias_relu_perchannel_quantization.cpp | 205 ++ ...2d_fwd_bias_relu_perlayer_quantization.cpp | 198 ++ .../conv2d_fwd_perchannel_quantization.cpp | 198 ++ .../conv2d_fwd_perlayer_quantization.cpp | 192 ++ .../10_grouped_conv2d_bwd_data/CMakeLists.txt | 2 + .../grouped_conv2d_bwd_data.cpp | 226 ++ .../11_grouped_conv_bwd_weight/CMakeLists.txt | 2 + .../grouped_conv2d_bwd_weight.cpp | 190 ++ .../CMakeLists.txt | 2 + .../elementwise_layernorm2d.cpp | 175 ++ client_example/13_batchnorm/CMakeLists.txt | 4 + .../13_batchnorm/batchnorm_bwd_nhwc.cpp | 201 ++ .../13_batchnorm/batchnorm_fwd_nhwc.cpp | 197 ++ client_example/14_instance_id/CMakeLists.txt | 2 + .../batchnorm_fwd_instance_id.cpp | 206 ++ client_example/CMakeLists.txt | 15 + client_example/README.md | 21 + cmake/Analyzers.cmake | 34 + cmake/ClangTidy.cmake | 162 ++ cmake/CppCheck.cmake | 130 ++ cmake/DoxygenDoc.cmake | 355 +++ cmake/EnableCompilerWarnings.cmake | 110 + cmake/TargetFlags.cmake | 50 + cmake/googletest.cmake | 49 + dev-requirements.txt | 3 + doc/image/ck_component.png | Bin 0 -> 565049 bytes doc/image/ck_layer.png | Bin 0 -> 549343 bytes doc/markdown/dockerhub.md | 93 + example/01_gemm/CMakeLists.txt | 37 + example/01_gemm/README.md | 23 + example/01_gemm/common.hpp | 89 + example/01_gemm/gemm_dl_fp16.cpp | 37 + example/01_gemm/gemm_dl_fp32.cpp | 37 + example/01_gemm/gemm_dl_int4.cpp | 45 + example/01_gemm/gemm_dl_int8.cpp | 37 + example/01_gemm/gemm_xdl_bf16.cpp | 38 + example/01_gemm/gemm_xdl_fp16.cpp | 50 + example/01_gemm/gemm_xdl_fp64.cpp | 46 + example/01_gemm/gemm_xdl_int4.cpp | 46 + example/01_gemm/gemm_xdl_int8.cpp | 38 + example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp | 249 +++ example/01_gemm/run_gemm_example.inc | 151 ++ example/02_gemm_bilinear/CMakeLists.txt | 1 + example/02_gemm_bilinear/README.md | 28 + .../gemm_bilinear_xdl_fp16.cpp | 306 +++ example/03_gemm_bias_relu/CMakeLists.txt | 1 + example/03_gemm_bias_relu/README.md | 10 + .../gemm_bias_relu_xdl_fp16.cpp | 283 +++ .../04_gemm_add_add_fastgelu/CMakeLists.txt | 17 + example/04_gemm_add_add_fastgelu/README.md | 23 + example/04_gemm_add_add_fastgelu/common.hpp | 106 + .../gemm_add_add_fastgelu_xdl_bf16.cpp | 47 + .../gemm_add_add_fastgelu_xdl_fp16.cpp | 47 + .../gemm_add_add_fastgelu_xdl_fp32.cpp | 47 + 
.../gemm_add_add_fastgelu_xdl_int4.cpp | 59 + .../gemm_add_add_fastgelu_xdl_int8.cpp | 47 + .../run_gemm_add_add_fastgelu_example.inc | 166 ++ example/09_convnd_fwd/CMakeLists.txt | 11 + example/09_convnd_fwd/README.md | 32 + example/09_convnd_fwd/convnd_fwd_common.hpp | 172 ++ .../09_convnd_fwd/convnd_fwd_dl_common.hpp | 196 ++ example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp | 40 + example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp | 40 + example/09_convnd_fwd/convnd_fwd_dl_int8.cpp | 40 + example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp | 79 + example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp | 79 + example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp | 79 + example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp | 79 + example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp | 79 + .../run_convnd_fwd_dl_example.inc | 98 + .../09_convnd_fwd/run_convnd_fwd_example.inc | 97 + .../CMakeLists.txt | 16 + .../common.hpp | 159 ++ .../convnd_fwd_max_xdl_bf16.cpp | 18 + .../convnd_fwd_max_xdl_fp16.cpp | 18 + .../convnd_fwd_max_xdl_fp32.cpp | 18 + .../convnd_fwd_max_xdl_int4.cpp | 26 + .../convnd_fwd_max_xdl_int8.cpp | 18 + .../run_convnd_fwd_max_example.inc | 307 +++ example/12_reduce/CMakeLists.txt | 3 + example/12_reduce/README.md | 62 + example/12_reduce/reduce_blockwise.cpp | 299 +++ example/12_reduce/reduce_blockwise_impl.hpp | 338 +++ .../12_reduce/reduce_blockwise_two_call.cpp | 301 +++ example/12_reduce/reduce_example_common.hpp | 49 + .../reduce_multiblock_atomic_add.cpp | 216 ++ .../reduce_multiblock_atomic_add_impl.hpp | 233 ++ example/13_pool2d_fwd/CMakeLists.txt | 3 + example/13_pool2d_fwd/README.md | 41 + example/13_pool2d_fwd/pool2d_fwd_common.hpp | 283 +++ example/13_pool2d_fwd/pool2d_fwd_fp16.cpp | 117 + example/13_pool2d_fwd/pool2d_fwd_fp32.cpp | 117 + example/14_gemm_quantization/CMakeLists.txt | 2 + .../gemm_xdl_bias_relu_quantization_int8.cpp | 235 ++ .../gemm_xdl_quantization_int8.cpp | 207 ++ example/15_grouped_gemm/CMakeLists.txt | 17 + example/15_grouped_gemm/README.md | 25 + .../grouped_gemm_xdl_bfp16.cpp | 62 + .../15_grouped_gemm/grouped_gemm_xdl_fp16.cpp | 62 + .../15_grouped_gemm/grouped_gemm_xdl_fp32.cpp | 62 + .../15_grouped_gemm/grouped_gemm_xdl_int4.cpp | 102 + .../15_grouped_gemm/grouped_gemm_xdl_int8.cpp | 59 + .../run_grouped_gemm_example.inc | 265 +++ .../CMakeLists.txt | 40 + .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 276 +++ .../gemm_add_addsquare_xdl_int8.cpp | 364 +++ .../gemm_max_xdl_bf16.cpp | 167 ++ .../gemm_max_xdl_fp16.cpp | 167 ++ .../gemm_max_xdl_fp32.cpp | 166 ++ .../gemm_max_xdl_int4.cpp | 172 ++ .../gemm_max_xdl_int8.cpp | 166 ++ .../gemm_mean_meansquare_xdl_bf16.cpp | 174 ++ .../gemm_mean_meansquare_xdl_fp16.cpp | 174 ++ .../gemm_mean_meansquare_xdl_fp32.cpp | 174 ++ .../gemm_reduce_xdl_common.hpp | 491 +++++ example/17_convnd_bwd_data/CMakeLists.txt | 5 + example/17_convnd_bwd_data/README.md | 47 + .../convnd_bwd_data_common.hpp | 152 ++ .../convnd_bwd_data_dl_fp16.cpp | 180 ++ .../convnd_bwd_data_xdl_fp16.cpp | 207 ++ example/18_batched_gemm_reduce/CMakeLists.txt | 2 + .../batched_gemm_reduce_xdl_fp16.cpp | 311 +++ example/19_binary_elementwise/CMakeLists.txt | 4 + .../broadcast_add_2d_amn_bn.cpp | 136 ++ .../broadcast_add_3d_am_bmnk.cpp | 120 + .../elementwise_add_1d.cpp | 111 + .../elementwise_add_4d.cpp | 120 + .../20_grouped_conv_bwd_weight/CMakeLists.txt | 8 + example/20_grouped_conv_bwd_weight/common.hpp | 138 ++ .../grouped_conv_bwd_weight_xdl_bf16.cpp | 18 + .../grouped_conv_bwd_weight_xdl_fp16.cpp | 17 + .../run_grouped_conv_bwd_weight_example.inc | 206 ++ 
example/21_gemm_layernorm/CMakeLists.txt | 3 + .../gemm_bias_relu_add_layernorm_xdl_fp16.cpp | 406 ++++ .../gemm_layernorm_xdl_fp16.cpp | 375 ++++ .../gemm_xdl_layernorm_single_kernel_fp16.cpp | 287 +++ example/22_cgemm/CMakeLists.txt | 17 + example/22_cgemm/cgemm_xdl_bf16.cpp | 132 ++ example/22_cgemm/cgemm_xdl_common.hpp | 254 +++ example/22_cgemm/cgemm_xdl_fp16.cpp | 131 ++ example/22_cgemm/cgemm_xdl_fp32.cpp | 132 ++ example/22_cgemm/cgemm_xdl_int4.cpp | 140 ++ example/22_cgemm/cgemm_xdl_int8.cpp | 132 ++ example/23_softmax/CMakeLists.txt | 1 + example/23_softmax/README.md | 18 + example/23_softmax/softmax_blockwise.cpp | 264 +++ example/24_batched_gemm/CMakeLists.txt | 17 + .../batched_gemm_xdl_bfp16.cpp | 59 + .../24_batched_gemm/batched_gemm_xdl_fp16.cpp | 59 + .../24_batched_gemm/batched_gemm_xdl_fp32.cpp | 58 + .../24_batched_gemm/batched_gemm_xdl_int4.cpp | 99 + .../24_batched_gemm/batched_gemm_xdl_int8.cpp | 56 + .../run_batched_gemm_example.inc | 240 ++ example/25_gemm_bias_e_permute/CMakeLists.txt | 2 + .../gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp | 397 ++++ .../gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp | 398 ++++ example/26_contraction/CMakeLists.txt | 2 + example/26_contraction/README.md | 20 + .../contraction_bilinear_xdl_fp32.cpp | 427 ++++ .../contraction_scale_xdl_fp32.cpp | 409 ++++ example/27_layernorm/CMakeLists.txt | 1 + example/27_layernorm/layernorm_blockwise.cpp | 139 ++ .../CMakeLists.txt | 1 + .../grouped_gemm_bias_e_permute_xdl_fp16.cpp | 466 ++++ .../CMakeLists.txt | 1 + .../batched_gemm_bias_e_permute_xdl_fp16.cpp | 397 ++++ .../CMakeLists.txt | 22 + .../30_grouped_conv_fwd_multiple_d/README.md | 30 + .../30_grouped_conv_fwd_multiple_d/common.hpp | 355 +++ ...rouped_conv_fwd_bias_relu_add_xdl_bf16.cpp | 26 + ...rouped_conv_fwd_bias_relu_add_xdl_fp16.cpp | 26 + ...rouped_conv_fwd_bias_relu_add_xdl_fp32.cpp | 26 + ...rouped_conv_fwd_bias_relu_add_xdl_int4.cpp | 31 + ...rouped_conv_fwd_bias_relu_add_xdl_int8.cpp | 26 + .../grouped_conv_fwd_xdl_fp16.cpp | 24 + ...grouped_conv_fwd_bias_relu_add_example.inc | 288 +++ .../run_grouped_conv_fwd_example.inc | 223 ++ example/31_batched_gemm_gemm/CMakeLists.txt | 8 + .../batched_gemm_gemm_xdl_bf16.cpp | 136 ++ .../batched_gemm_gemm_xdl_fp16.cpp | 136 ++ .../batched_gemm_gemm_xdl_fp32.cpp | 135 ++ .../batched_gemm_gemm_xdl_int4.cpp | 146 ++ .../batched_gemm_gemm_xdl_int8.cpp | 133 ++ .../run_batched_gemm_gemm_example.inc | 277 +++ .../CMakeLists.txt | 16 + ...le_scale_softmax_gemm_permute_xdl_fp16.cpp | 160 ++ ...mm_scale_softmax_gemm_permute_xdl_bf16.cpp | 159 ++ ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 160 ++ ...tched_gemm_scale_softmax_gemm_xdl_bf16.cpp | 143 ++ ...tched_gemm_scale_softmax_gemm_xdl_fp16.cpp | 144 ++ ...le_scale_softmax_gemm_permute_xdl_fp16.cpp | 159 ++ ...mm_scale_softmax_gemm_permute_xdl_fp16.cpp | 160 ++ .../run_batched_gemm_scale_softmax_gemm.inc | 261 +++ ...atched_gemm_scale_softmax_gemm_permute.inc | 278 +++ ...rouped_gemm_scale_softmax_gemm_permute.inc | 319 +++ example/33_multiple_reduce/CMakeLists.txt | 2 + example/33_multiple_reduce/README.md | 37 + .../33_multiple_reduce/dual_reduce_common.hpp | 314 +++ .../dual_reduce_multiblock.cpp | 98 + .../dual_reduce_threadwise.cpp | 93 + example/34_batchnorm/CMakeLists.txt | 3 + example/34_batchnorm/README.md | 81 + .../34_batchnorm/batchnorm_backward_nhwc.cpp | 506 +++++ example/34_batchnorm/batchnorm_common.hpp | 68 + .../batchnorm_forward_inferring_nhwc.cpp | 366 +++ .../batchnorm_forward_training_nhwc.cpp | 591 +++++ 
example/34_batchnorm/batchnorm_infer_impl.hpp | 131 ++ example/35_splitK_gemm/CMakeLists.txt | 17 + .../run_splitK_gemm_example.inc | 217 ++ .../35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp | 58 + .../35_splitK_gemm/splitK_gemm_xdl_fp16.cpp | 58 + .../35_splitK_gemm/splitK_gemm_xdl_fp32.cpp | 58 + .../35_splitK_gemm/splitK_gemm_xdl_int4.cpp | 92 + .../35_splitK_gemm/splitK_gemm_xdl_int8.cpp | 55 + example/36_sparse_embedding/CMakeLists.txt | 1 + .../sparse_embedding3_forward_layernorm.cpp | 219 ++ .../CMakeLists.txt | 1 + ...ed_gemm_add_add_relu_gemm_add_xdl_fp16.cpp | 519 +++++ .../CMakeLists.txt | 7 + .../common.hpp | 103 + .../grouped_conv_bwd_data_bias_relu_fp16.cpp | 33 + .../grouped_conv_bwd_data_fp16.cpp | 33 + ...rouped_conv_bwd_data_bias_relu_example.inc | 224 ++ .../run_grouped_conv_bwd_data_example.inc | 190 ++ example/39_permute/CMakeLists.txt | 9 + example/39_permute/common.hpp | 456 ++++ example/39_permute/permute_1xHxW_fp16.cpp | 20 + example/39_permute/permute_HxWx4_fp16.cpp | 22 + example/39_permute/permute_NxHxW_fp16.cpp | 20 + .../39_permute/run_permute_bundle_example.inc | 78 + .../run_permute_element_example.inc | 65 + .../41_grouped_conv_conv_fwd/CMakeLists.txt | 8 + .../grouped_conv_conv_fwd_xdl_bf16.cpp | 109 + .../grouped_conv_conv_fwd_xdl_fp16.cpp | 109 + .../grouped_conv_conv_fwd_xdl_fp32.cpp | 109 + .../grouped_conv_conv_fwd_xdl_int4.cpp | 122 + .../grouped_conv_conv_fwd_xdl_int8.cpp | 109 + .../run_grouped_conv_conv_fwd_example.inc | 379 ++++ example/42_groupnorm/CMakeLists.txt | 1 + .../42_groupnorm/groupnorm_sigmoid_fp16.cpp | 174 ++ .../CMakeLists.txt | 2 + .../splitk_gemm_bias_e_permute_xdl_fp16.cpp | 407 ++++ .../splitk_gemm_bias_e_permute_xdl_fp32.cpp | 407 ++++ .../44_conv2d_fwd_quantization/CMakeLists.txt | 3 + ...bias_relu_perchannel_quantization_int8.cpp | 342 +++ ...l_bias_relu_perlayer_quantization_int8.cpp | 318 +++ ...v2d_fwd_xdl_perlayer_quantization_int8.cpp | 279 +++ example/44_elementwise_permute/CMakeLists.txt | 2 + .../elementwise_permute_4D_fp16.cpp | 116 + .../elementwise_permute_4D_fp16_2d.cpp | 130 ++ .../CMakeLists.txt | 1 + .../elementwise_layernorm_blockwise.cpp | 195 ++ example/CMakeLists.txt | 32 + include/ck/ck.hpp | 201 ++ include/ck/host_utility/device_prop.hpp | 54 + include/ck/host_utility/hip_check_error.hpp | 17 + include/ck/host_utility/io.hpp | 41 + include/ck/host_utility/kernel_launch.hpp | 74 + ...volution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp | 275 +++ ...lution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp | 355 +++ ...into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp | 150 ++ ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 132 ++ ...into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp | 150 ++ ...lution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp | 135 ++ ...lution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp | 147 ++ ...n3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp | 153 ++ ...volution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp | 260 +++ ...volution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp | 179 ++ ...lution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp | 132 ++ ...lution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp | 132 ++ ...lution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp | 134 ++ ...volution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp | 135 ++ include/ck/stream_config.hpp | 14 + include/ck/tensor/static_tensor.hpp | 273 +++ .../tensor_description/cluster_descriptor.hpp | 34 + .../multi_index_transform.hpp | 1954 +++++++++++++++++ .../multi_index_transform_helper.hpp | 130 ++ .../ck/tensor_description/tensor_adaptor.hpp | 482 ++++ .../tensor_description/tensor_descriptor.hpp | 615 ++++++ 
.../tensor_descriptor_helper.hpp | 165 ++ .../tensor_space_filling_curve.hpp | 162 ++ .../gpu/block/blockwise_gemm_dl_v2r3.hpp | 412 ++++ .../gpu/block/blockwise_gemm_dlops_v2r2.hpp | 397 ++++ .../gpu/block/blockwise_gemm_dlops_v3.hpp | 178 ++ .../gpu/block/blockwise_gemm_xdlops.hpp | 998 +++++++++ .../blockwise_gemm_xdlops_skip_b_lds.hpp | 321 +++ .../gpu/block/blockwise_softmax.hpp | 115 + .../blockwise_tensor_slice_transfer_v5r1.hpp | 156 ++ .../gpu/block/blockwise_welford.hpp | 108 + .../block/reduction_functions_blockwise.hpp | 244 ++ ...hread_group_tensor_slice_transfer_v4r1.hpp | 173 ++ ...hread_group_tensor_slice_transfer_v6r1.hpp | 134 ++ ...hread_group_tensor_slice_transfer_v6r2.hpp | 158 ++ ...hread_group_tensor_slice_transfer_v6r3.hpp | 183 ++ .../thread_group_tensor_slice_transfer_v7.hpp | 172 ++ ...nvolution_backward_data_specialization.hpp | 30 + ...olution_backward_weight_specialization.hpp | 33 + .../convolution_forward_specialization.hpp | 34 + .../gpu/device/device_base.hpp | 74 + .../device_batched_contraction_multiple_d.hpp | 64 + .../gpu/device/device_batched_gemm.hpp | 68 + .../device/device_batched_gemm_e_permute.hpp | 50 + .../gpu/device/device_batched_gemm_gemm.hpp | 59 + .../device/device_batched_gemm_multi_d.hpp | 58 + ...atched_gemm_multiple_d_gemm_multiple_d.hpp | 72 + .../device_batched_gemm_softmax_gemm.hpp | 60 + ...vice_batched_gemm_softmax_gemm_permute.hpp | 70 + .../gpu/device/device_batchnorm_backward.hpp | 77 + .../gpu/device/device_batchnorm_forward.hpp | 72 + .../gpu/device/device_batchnorm_infer.hpp | 69 + .../gpu/device/device_cgemm.hpp | 51 + .../device/device_contraction_multiple_d.hpp | 63 + .../gpu/device/device_conv_bwd_data.hpp | 49 + .../gpu/device/device_conv_fwd.hpp | 49 + .../device_conv_fwd_bias_activation.hpp | 53 + .../device_conv_fwd_bias_activation_add.hpp | 53 + .../gpu/device/device_elementwise_2d.hpp | 341 +++ .../gpu/device/device_elementwise_base.hpp | 45 + .../device_elementwise_normalization.hpp | 68 + .../gpu/device/device_gemm.hpp | 42 + .../gpu/device/device_gemm_bias_e_permute.hpp | 51 + .../gpu/device/device_gemm_multiple_d.hpp | 58 + .../device_gemm_multiple_d_multiple_r.hpp | 97 + .../gpu/device/device_gemm_reduce.hpp | 46 + .../gpu/device/device_gemm_splitk.hpp | 64 + .../device_grouped_contraction_multiple_d.hpp | 72 + ...evice_grouped_conv_bwd_data_multiple_d.hpp | 67 + .../device/device_grouped_conv_bwd_weight.hpp | 51 + .../gpu/device/device_grouped_conv_fwd.hpp | 55 + ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 959 ++++++++ ...ice_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp | 837 +++++++ .../device_grouped_conv_fwd_multiple_d.hpp | 65 + .../gpu/device/device_grouped_gemm.hpp | 51 + ...vice_grouped_gemm_softmax_gemm_permute.hpp | 75 + ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 881 ++++++++ .../gpu/device/device_multiple_reduce.hpp | 58 + .../gpu/device/device_normalization.hpp | 62 + .../gpu/device/device_permute.hpp | 37 + .../gpu/device/device_pool2d_fwd.hpp | 40 + .../gpu/device/device_reduce.hpp | 51 + .../gpu/device/device_softmax.hpp | 69 + .../device_splitk_contraction_multiple_d.hpp | 65 + ...tk_contraction_multiple_d_xdl_cshuffle.hpp | 1147 ++++++++++ .../gpu/device/gemm_specialization.hpp | 58 + ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 1040 +++++++++ .../device_batched_gemm_e_permute_xdl.hpp | 683 ++++++ .../device_batched_gemm_gemm_xdl_cshuffle.hpp | 747 +++++++ .../impl/device_batched_gemm_multi_d_xdl.hpp | 716 ++++++ ...ultiple_d_gemm_multiple_d_xdl_cshuffle.hpp | 951 ++++++++ 
...evice_batched_gemm_reduce_xdl_cshuffle.hpp | 1001 +++++++++ ...gemm_softmax_gemm_permute_xdl_cshuffle.hpp | 859 ++++++++ ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp | 771 +++++++ .../device/impl/device_batched_gemm_xdl.hpp | 668 ++++++ .../impl/device_batchnorm_backward_impl.hpp | 874 ++++++++ .../impl/device_batchnorm_forward_impl.hpp | 718 ++++++ .../impl/device_cgemm_4gemm_xdl_cshuffle.hpp | 948 ++++++++ ...ce_contraction_multiple_d_xdl_cshuffle.hpp | 779 +++++++ ...rd_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 787 +++++++ ...ice_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp | 835 +++++++ ...fle_bias_activation_add_nhwc_kyxc_nhwk.hpp | 968 ++++++++ ...shuffle_bias_activation_nhwc_kyxc_nhwk.hpp | 925 ++++++++ ...onv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp | 893 ++++++++ .../device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp | 733 +++++++ ...ice_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp | 268 +++ ...evice_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp | 642 ++++++ .../device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp | 1583 +++++++++++++ ...device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp | 1568 +++++++++++++ .../gpu/device/impl/device_elementwise.hpp | 304 +++ .../device_elementwise_normalization_impl.hpp | 592 +++++ ...vice_gemm_bias_add_reduce_xdl_cshuffle.hpp | 875 ++++++++ .../impl/device_gemm_bias_e_permute_xdl.hpp | 572 +++++ .../gpu/device/impl/device_gemm_dl.hpp | 595 +++++ ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp | 682 ++++++ .../device_gemm_multiple_d_xdl_cshuffle.hpp | 698 ++++++ .../impl/device_gemm_reduce_xdl_cshuffle.hpp | 835 +++++++ .../gpu/device/impl/device_gemm_xdl.hpp | 570 +++++ .../device/impl/device_gemm_xdl_cshuffle.hpp | 700 ++++++ .../device_gemm_xdl_layernorm_cshuffle.hpp | 773 +++++++ .../impl/device_gemm_xdl_skip_b_lds.hpp | 523 +++++ .../impl/device_gemm_xdl_splitk_c_shuffle.hpp | 650 ++++++ ...ed_contraction_multiple_d_xdl_cshuffle.hpp | 907 ++++++++ ...nv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp | 1015 +++++++++ ...bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp | 1244 +++++++++++ ...grouped_conv_fwd_multiple_d_multiple_r.hpp | 77 + ...fwd_multiple_d_multiple_r_xdl_cshuffle.hpp | 1105 ++++++++++ ...ouped_conv_fwd_multiple_d_xdl_cshuffle.hpp | 952 ++++++++ .../device/impl/device_grouped_gemm_xdl.hpp | 677 ++++++ .../device_multiple_reduce_multiblock.hpp | 595 +++++ .../device_multiple_reduce_threadwise.hpp | 422 ++++ .../device/impl/device_normalization_impl.hpp | 476 ++++ .../gpu/device/impl/device_permute_impl.hpp | 282 +++ .../impl/device_pool2d_fwd_nhwc_nhwc.hpp | 327 +++ .../gpu/device/impl/device_reduce_common.hpp | 142 ++ .../device/impl/device_reduce_multiblock.hpp | 543 +++++ .../device/impl/device_reduce_threadwise.hpp | 382 ++++ .../gpu/device/impl/device_softmax_impl.hpp | 423 ++++ ...ce_sparse_embedding3_forward_layernorm.hpp | 210 ++ .../gpu/device/masking_specialization.hpp | 82 + .../gpu/device/matrix_padder.hpp | 382 ++++ .../gpu/device/reduction_operator_mapping.hpp | 186 ++ .../gpu/device/tensor_layout.hpp | 417 ++++ .../gpu/device/tensor_specialization.hpp | 28 + .../gpu/device/welford_helper.hpp | 89 + .../element/binary_element_wise_operation.hpp | 286 +++ .../gpu/element/element_wise_operation.hpp | 304 +++ .../gpu/element/quantization_operation.hpp | 124 ++ .../element/unary_element_wise_operation.hpp | 268 +++ ...e_second_half_batchnorm_backward_final.hpp | 498 +++++ ...gridwise_multiblock_welford_first_half.hpp | 261 +++ ...rd_second_half_batchnorm_forward_final.hpp | 571 +++++ ...cond_half_multiblock_reduce_first_half.hpp | 556 +++++ .../gpu/grid/block_to_ctile_map.hpp | 
546 +++++ ...dwise_2d_multiple_reduction_multiblock.hpp | 321 +++ ...dwise_2d_multiple_reduction_threadwise.hpp | 264 +++ .../grid/gridwise_2d_reduction_multiblock.hpp | 613 ++++++ .../grid/gridwise_2d_reduction_threadwise.hpp | 474 ++++ ...wise_batched_gemm_gemm_xdl_cshuffle_v1.hpp | 931 ++++++++ ...iple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp | 1268 +++++++++++ ...ched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp | 1131 ++++++++++ ...e_batchnorm_backward_blockwise_welford.hpp | 554 +++++ ...se_batchnorm_forward_blockwise_welford.hpp | 483 ++++ .../grid/gridwise_contraction_dlops_v1r2.hpp | 662 ++++++ .../gpu/grid/gridwise_elementwise_1d.hpp | 195 ++ .../gpu/grid/gridwise_elementwise_2d.hpp | 230 ++ ...elementwise_layernorm_welford_variance.hpp | 500 +++++ ...e_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp | 997 +++++++++ .../gpu/grid/gridwise_gemm_dl_multiple_d.hpp | 678 ++++++ .../gpu/grid/gridwise_gemm_dl_v1r3.hpp | 577 +++++ .../gpu/grid/gridwise_gemm_dlops_v1r2.hpp | 608 +++++ .../gpu/grid/gridwise_gemm_dlops_v2.hpp | 461 ++++ .../gpu/grid/gridwise_gemm_dlops_v3.hpp | 1597 ++++++++++++++ ...emm_multiple_d_multiple_r_xdl_cshuffle.hpp | 944 ++++++++ .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp | 753 +++++++ .../grid/gridwise_gemm_pipeline_selector.hpp | 43 + .../gpu/grid/gridwise_gemm_pipeline_v1.hpp | 369 ++++ .../gpu/grid/gridwise_gemm_pipeline_v2.hpp | 128 ++ .../gridwise_gemm_reduce_xdl_cshuffle_v1.hpp | 879 ++++++++ ...e_gemm_split_k_multiple_d_xdl_cshuffle.hpp | 1263 +++++++++++ .../grid/gridwise_gemm_xdl_cshuffle_v1.hpp | 653 ++++++ ...ridwise_gemm_xdl_layernorm_cshuffle_v1.hpp | 1068 +++++++++ .../grid/gridwise_gemm_xdlops_bwd_weight.hpp | 983 +++++++++ .../gridwise_gemm_xdlops_skip_b_lds_v1.hpp | 678 ++++++ .../gpu/grid/gridwise_gemm_xdlops_v2r3.hpp | 557 +++++ .../gpu/grid/gridwise_gemm_xdlops_v2r4.hpp | 616 ++++++ .../gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp | 721 ++++++ .../gpu/grid/gridwise_gemm_xdlops_v3r1.hpp | 723 ++++++ .../gpu/grid/gridwise_gemm_xdlops_v3r2.hpp | 762 +++++++ .../gpu/grid/gridwise_gemm_xdlops_v3r3.hpp | 801 +++++++ .../gridwise_normalization_naive_variance.hpp | 360 +++ ...ridwise_normalization_welford_variance.hpp | 384 ++++ .../gpu/grid/gridwise_permute.hpp | 339 +++ .../gpu/grid/gridwise_set_buffer_value.hpp | 57 + .../gridwise_set_multiple_buffer_value.hpp | 86 + .../gpu/grid/gridwise_softmax.hpp | 407 ++++ ...se_sparse_embedding3_forward_layernorm.hpp | 344 +++ .../thread/reduction_functions_threadwise.hpp | 99 + .../gpu/thread/threadwise_contraction_dl.hpp | 224 ++ .../gpu/thread/threadwise_gemm_dlops_v3.hpp | 168 ++ .../thread/threadwise_tensor_slice_set.hpp | 60 + .../threadwise_tensor_slice_transfer.hpp | 1301 +++++++++++ .../threadwise_tensor_slice_transfer_v3r1.hpp | 794 +++++++ .../threadwise_tensor_slice_transfer_v3r3.hpp | 886 ++++++++ .../threadwise_tensor_slice_transfer_v4r1.hpp | 175 ++ .../threadwise_tensor_slice_transfer_v5r1.hpp | 614 ++++++ .../threadwise_tensor_slice_transfer_v6r1.hpp | 213 ++ .../threadwise_tensor_slice_transfer_v6r2.hpp | 260 +++ .../threadwise_tensor_slice_transfer_v6r3.hpp | 310 +++ .../threadwise_tensor_slice_transfer_v7.hpp | 298 +++ .../gpu/thread/threadwise_welford.hpp | 137 ++ .../tensor_operation/gpu/warp/xdlops_gemm.hpp | 851 +++++++ .../transform_contraction_to_gemm.hpp | 288 +++ .../transform_conv_bwd_data_to_gemm_v1.hpp | 583 +++++ .../transform_conv_fwd_to_gemm.hpp | 880 ++++++++ include/ck/utility/amd_address_space.hpp | 45 + include/ck/utility/amd_buffer_addressing.hpp | 1177 ++++++++++ 
include/ck/utility/amd_inline_asm.hpp | 359 +++ include/ck/utility/amd_llvm_intrinsic.hpp | 14 + include/ck/utility/amd_wmma.hpp | 102 + include/ck/utility/amd_xdlops.hpp | 320 +++ include/ck/utility/array.hpp | 66 + include/ck/utility/array_multi_index.hpp | 80 + include/ck/utility/c_style_pointer_cast.hpp | 25 + include/ck/utility/common_header.hpp | 51 + .../ck/utility/container_element_picker.hpp | 158 ++ include/ck/utility/container_helper.hpp | 393 ++++ include/ck/utility/data_type.hpp | 1059 +++++++++ include/ck/utility/debug.hpp | 85 + include/ck/utility/dynamic_buffer.hpp | 398 ++++ include/ck/utility/enable_if.hpp | 14 + include/ck/utility/functional.hpp | 131 ++ include/ck/utility/functional2.hpp | 49 + include/ck/utility/functional3.hpp | 144 ++ include/ck/utility/functional4.hpp | 65 + .../utility/generic_memory_space_atomic.hpp | 123 ++ include/ck/utility/get_id.hpp | 28 + include/ck/utility/ignore.hpp | 22 + include/ck/utility/inner_product.hpp | 205 ++ include/ck/utility/integral_constant.hpp | 51 + .../ck/utility/is_known_at_compile_time.hpp | 56 + include/ck/utility/magic_division.hpp | 160 ++ include/ck/utility/math.hpp | 240 ++ include/ck/utility/math_v2.hpp | 164 ++ include/ck/utility/multi_index.hpp | 12 + include/ck/utility/number.hpp | 18 + include/ck/utility/print.hpp | 25 + include/ck/utility/reduction_common.hpp | 40 + include/ck/utility/reduction_enums.hpp | 41 + .../reduction_functions_accumulate.hpp | 115 + include/ck/utility/reduction_operator.hpp | 292 +++ include/ck/utility/sequence.hpp | 899 ++++++++ include/ck/utility/sequence_helper.hpp | 37 + include/ck/utility/span.hpp | 67 + include/ck/utility/static_buffer.hpp | 195 ++ .../ck/utility/statically_indexed_array.hpp | 105 + .../statically_indexed_array_multi_index.hpp | 162 ++ include/ck/utility/synchronization.hpp | 32 + include/ck/utility/thread_group.hpp | 22 + include/ck/utility/transpose_vectors.hpp | 165 ++ include/ck/utility/tuple.hpp | 221 ++ include/ck/utility/tuple_helper.hpp | 81 + include/ck/utility/type.hpp | 60 + library/CMakeLists.txt | 2 + .../cpu/reference_batched_gemm.hpp | 138 ++ .../cpu/reference_batchnorm_backward.hpp | 412 ++++ .../cpu/reference_batchnorm_forward.hpp | 368 ++++ .../cpu/reference_batchnorm_infer.hpp | 300 +++ .../cpu/reference_cgemm.hpp | 184 ++ .../cpu/reference_conv_bwd_data.hpp | 378 ++++ .../cpu/reference_conv_bwd_weight.hpp | 327 +++ .../cpu/reference_conv_fwd.hpp | 339 +++ .../reference_conv_fwd_bias_activation.hpp | 192 ++ ...reference_conv_fwd_bias_activation_add.hpp | 200 ++ .../cpu/reference_gemm.hpp | 137 ++ .../cpu/reference_gemm_bias_2d.hpp | 136 ++ .../cpu/reference_gemm_bias_activation.hpp | 140 ++ .../reference_gemm_bias_activation_add.hpp | 148 ++ .../cpu/reference_gemm_layernorm.hpp | 236 ++ .../cpu/reference_groupnorm.hpp | 191 ++ .../cpu/reference_layernorm.hpp | 171 ++ .../cpu/reference_softmax.hpp | 175 ++ ...ce_sparse_embedding3_forward_layernorm.hpp | 205 ++ .../gpu/naive_conv_fwd.hpp | 125 ++ .../add_device_operation_instance.hpp | 35 + .../device_operation_instance_factory.hpp | 114 + .../gpu/batched_gemm.hpp | 259 +++ .../gpu/batched_gemm_add_relu_gemm_add.hpp | 139 ++ .../gpu/batched_gemm_gemm.hpp | 113 + .../gpu/batched_gemm_softmax_gemm.hpp | 121 + .../gpu/batched_gemm_softmax_gemm_permute.hpp | 185 ++ .../gpu/batchnorm_backward.hpp | 124 ++ .../gpu/batchnorm_forward.hpp | 117 + .../gpu/contraction_bilinear.hpp | 128 ++ .../gpu/contraction_scale.hpp | 127 ++ .../gpu/convolution_backward_data.hpp | 309 +++ .../gpu/convolution_forward.hpp 
| 128 ++ .../gpu/device_elementwise_instance.hpp | 54 + .../device_gemm_mean_squaremean_instance.hpp | 84 + .../gpu/elementwise_normalization.hpp | 79 + .../tensor_operation_instance/gpu/gemm.hpp | 385 ++++ .../gpu/gemm_add_add_fastgelu.hpp | 155 ++ .../gpu/gemm_add_fastgelu.hpp | 145 ++ .../gpu/gemm_bilinear.hpp | 148 ++ .../gpu/gemm_fastgelu.hpp | 138 ++ .../gpu/gemm_splitk.hpp | 147 ++ .../gpu/grouped_convolution_backward_data.hpp | 90 + .../grouped_convolution_backward_weight.hpp | 235 ++ .../gpu/grouped_convolution_forward.hpp | 396 ++++ .../gpu/grouped_gemm.hpp | 137 ++ .../gpu/normalization.hpp | 109 + ...n_bias_forward_perchannel_quantization.hpp | 114 + ...ion_bias_forward_perlayer_quantization.hpp | 114 + ...lution_forward_perchannel_quantization.hpp | 113 + ...volution_forward_perlayer_quantization.hpp | 110 + .../gpu/reduce/device_reduce_instance.hpp | 79 + .../device_reduce_instance_blockwise.hpp | 123 ++ ...uce_instance_blockwise_b16_f32_b16_add.hpp | 27 + ...ce_instance_blockwise_b16_f32_b16_amax.hpp | 31 + ...uce_instance_blockwise_b16_f32_b16_avg.hpp | 27 + ...uce_instance_blockwise_b16_f32_b16_max.hpp | 31 + ...uce_instance_blockwise_b16_f32_b16_min.hpp | 31 + ...e_instance_blockwise_b16_f32_b16_norm2.hpp | 27 + ...ce_instance_blockwise_f16_f16_f16_amax.hpp | 31 + ...uce_instance_blockwise_f16_f16_f16_max.hpp | 31 + ...uce_instance_blockwise_f16_f16_f16_min.hpp | 31 + ...uce_instance_blockwise_f16_f32_f16_add.hpp | 27 + ...uce_instance_blockwise_f16_f32_f16_avg.hpp | 27 + ...e_instance_blockwise_f16_f32_f16_norm2.hpp | 27 + ...uce_instance_blockwise_f32_f32_f32_add.hpp | 27 + ...ce_instance_blockwise_f32_f32_f32_amax.hpp | 31 + ...uce_instance_blockwise_f32_f32_f32_avg.hpp | 27 + ...uce_instance_blockwise_f32_f32_f32_max.hpp | 31 + ...uce_instance_blockwise_f32_f32_f32_min.hpp | 31 + ...e_instance_blockwise_f32_f32_f32_norm2.hpp | 27 + ...uce_instance_blockwise_f32_f64_f32_add.hpp | 27 + ...uce_instance_blockwise_f32_f64_f32_avg.hpp | 27 + ...e_instance_blockwise_f32_f64_f32_norm2.hpp | 27 + ...uce_instance_blockwise_f64_f64_f64_add.hpp | 27 + ...ce_instance_blockwise_f64_f64_f64_amax.hpp | 31 + ...uce_instance_blockwise_f64_f64_f64_avg.hpp | 27 + ...uce_instance_blockwise_f64_f64_f64_max.hpp | 31 + ...uce_instance_blockwise_f64_f64_f64_min.hpp | 31 + ...e_instance_blockwise_f64_f64_f64_norm2.hpp | 27 + ...educe_instance_blockwise_i8_i32_i8_add.hpp | 27 + ...educe_instance_blockwise_i8_i32_i8_avg.hpp | 27 + ...educe_instance_blockwise_i8_i8_i8_amax.hpp | 31 + ...reduce_instance_blockwise_i8_i8_i8_max.hpp | 31 + ...reduce_instance_blockwise_i8_i8_i8_min.hpp | 31 + .../device_reduce_instance_impl_common.hpp | 53 + ..._reduce_instance_multiblock_atomic_add.hpp | 123 ++ ..._multiblock_atomic_add_b16_f32_f32_add.hpp | 27 + ..._multiblock_atomic_add_b16_f32_f32_avg.hpp | 27 + ..._multiblock_atomic_add_f16_f32_f32_add.hpp | 27 + ..._multiblock_atomic_add_f16_f32_f32_avg.hpp | 27 + ..._multiblock_atomic_add_f32_f32_f32_add.hpp | 27 + ..._multiblock_atomic_add_f32_f32_f32_avg.hpp | 27 + ..._multiblock_atomic_add_f32_f64_f32_add.hpp | 28 + ..._multiblock_atomic_add_f32_f64_f32_avg.hpp | 28 + ..._multiblock_atomic_add_f64_f64_f64_add.hpp | 27 + ..._multiblock_atomic_add_f64_f64_f64_avg.hpp | 27 + .../device_reduce_instance_threadwise.hpp | 100 + ...ce_instance_threadwise_b16_f32_b16_add.hpp | 27 + ...e_instance_threadwise_b16_f32_b16_amax.hpp | 31 + ...ce_instance_threadwise_b16_f32_b16_avg.hpp | 27 + ...ce_instance_threadwise_b16_f32_b16_max.hpp | 31 + 
...ce_instance_threadwise_b16_f32_b16_min.hpp | 31 + ..._instance_threadwise_b16_f32_b16_norm2.hpp | 27 + ...e_instance_threadwise_f16_f16_f16_amax.hpp | 31 + ...ce_instance_threadwise_f16_f16_f16_max.hpp | 31 + ...ce_instance_threadwise_f16_f16_f16_min.hpp | 31 + ...ce_instance_threadwise_f16_f32_f16_add.hpp | 27 + ...ce_instance_threadwise_f16_f32_f16_avg.hpp | 27 + ..._instance_threadwise_f16_f32_f16_norm2.hpp | 27 + ...ce_instance_threadwise_f32_f32_f32_add.hpp | 27 + ...e_instance_threadwise_f32_f32_f32_amax.hpp | 31 + ...ce_instance_threadwise_f32_f32_f32_avg.hpp | 27 + ...ce_instance_threadwise_f32_f32_f32_max.hpp | 31 + ...ce_instance_threadwise_f32_f32_f32_min.hpp | 31 + ..._instance_threadwise_f32_f32_f32_norm2.hpp | 27 + ...ce_instance_threadwise_f32_f64_f32_add.hpp | 27 + ...ce_instance_threadwise_f32_f64_f32_avg.hpp | 27 + ..._instance_threadwise_f32_f64_f32_norm2.hpp | 27 + ...ce_instance_threadwise_f64_f64_f64_add.hpp | 27 + ...e_instance_threadwise_f64_f64_f64_amax.hpp | 31 + ...ce_instance_threadwise_f64_f64_f64_avg.hpp | 27 + ...ce_instance_threadwise_f64_f64_f64_max.hpp | 31 + ...ce_instance_threadwise_f64_f64_f64_min.hpp | 31 + ..._instance_threadwise_f64_f64_f64_norm2.hpp | 27 + ...duce_instance_threadwise_i8_i32_i8_add.hpp | 27 + ...duce_instance_threadwise_i8_i32_i8_avg.hpp | 27 + ...duce_instance_threadwise_i8_i8_i8_amax.hpp | 31 + ...educe_instance_threadwise_i8_i8_i8_max.hpp | 31 + ...educe_instance_threadwise_i8_i8_i8_min.hpp | 31 + .../tensor_operation_instance/gpu/softmax.hpp | 77 + .../device_softmax_f16_f16_instance.hpp | 22 + ...softmax_f16_f16_instance_rank3_reduce1.hpp | 22 + ...softmax_f16_f16_instance_rank3_reduce2.hpp | 22 + ...softmax_f16_f16_instance_rank3_reduce3.hpp | 22 + ...softmax_f16_f16_instance_rank4_reduce1.hpp | 22 + ...softmax_f16_f16_instance_rank4_reduce2.hpp | 22 + ...softmax_f16_f16_instance_rank4_reduce3.hpp | 22 + ...softmax_f16_f16_instance_rank4_reduce4.hpp | 22 + .../device_softmax_f16_f16_instance_type.hpp | 39 + .../device_softmax_f32_f32_instance.hpp | 22 + ...softmax_f32_f32_instance_rank3_reduce1.hpp | 22 + ...softmax_f32_f32_instance_rank3_reduce2.hpp | 22 + ...softmax_f32_f32_instance_rank3_reduce3.hpp | 22 + ...softmax_f32_f32_instance_rank4_reduce1.hpp | 22 + ...softmax_f32_f32_instance_rank4_reduce2.hpp | 22 + ...softmax_f32_f32_instance_rank4_reduce3.hpp | 22 + ...softmax_f32_f32_instance_rank4_reduce4.hpp | 22 + .../device_softmax_f32_f32_instance_type.hpp | 38 + .../softmax/device_softmax_i8_i8_instance.hpp | 22 + ...e_softmax_i8_i8_instance_rank3_reduce1.hpp | 22 + ...e_softmax_i8_i8_instance_rank3_reduce2.hpp | 22 + ...e_softmax_i8_i8_instance_rank3_reduce3.hpp | 22 + ...e_softmax_i8_i8_instance_rank4_reduce1.hpp | 22 + ...e_softmax_i8_i8_instance_rank4_reduce2.hpp | 22 + ...e_softmax_i8_i8_instance_rank4_reduce3.hpp | 22 + ...e_softmax_i8_i8_instance_rank4_reduce4.hpp | 22 + .../device_softmax_i8_i8_instance_type.hpp | 40 + .../gpu/softmax/device_softmax_instance.hpp | 8 + .../include/ck/library/utility/algorithm.hpp | 43 + .../include/ck/library/utility/check_err.hpp | 218 ++ .../ck/library/utility/conv_common.hpp | 77 + ...volution_host_tensor_descriptor_helper.hpp | 355 +++ .../library/utility/convolution_parameter.hpp | 101 + .../ck/library/utility/device_memory.hpp | 43 + library/include/ck/library/utility/fill.hpp | 118 + .../ck/library/utility/host_common_util.hpp | 136 ++ .../include/ck/library/utility/host_conv.hpp | 152 ++ .../include/ck/library/utility/host_gemm.hpp | 47 + 
.../ck/library/utility/host_reduction.hpp | 374 ++++ .../ck/library/utility/host_tensor.hpp | 472 ++++ .../library/utility/host_tensor_generator.hpp | 190 ++ .../include/ck/library/utility/iterator.hpp | 22 + .../include/ck/library/utility/literals.hpp | 20 + .../include/ck/library/utility/numeric.hpp | 16 + .../ck/library/utility/op_instance_engine.hpp | 249 +++ library/include/ck/library/utility/ranges.hpp | 60 + .../gpu/CMakeLists.txt | 67 + .../gpu/batched_gemm/CMakeLists.txt | 18 + ...dl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp | 59 + ...dl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp | 58 + ...dl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp | 62 + ...dl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp | 63 + ...m_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp | 83 + ...m_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp | 83 + ...m_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp | 110 + ...m_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp | 98 + ...m_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp | 58 + ...m_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp | 58 + ...m_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp | 58 + ...m_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp | 63 + ...dl_int8_int8_int8_gkm_gkn_gmn_instance.cpp | 79 + ...dl_int8_int8_int8_gkm_gnk_gmn_instance.cpp | 79 + ...dl_int8_int8_int8_gmk_gkn_gmn_instance.cpp | 79 + ...dl_int8_int8_int8_gmk_gnk_gmn_instance.cpp | 71 + .../CMakeLists.txt | 4 + ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 80 + ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 81 + .../gpu/batched_gemm_gemm/CMakeLists.txt | 4 + ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 80 + ...6_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp | 80 + .../gpu/batched_gemm_reduce/CMakeLists.txt | 7 + ...6_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp | 80 + ...6_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp | 80 + ...6_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp | 80 + ...6_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp | 77 + .../batched_gemm_softmax_gemm/CMakeLists.txt | 4 + ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 131 ++ .../CMakeLists.txt | 5 + ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp | 133 ++ ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp | 133 ++ .../gpu/batchnorm/CMakeLists.txt | 10 + ...evice_batchnorm_backward_bf16_instance.cpp | 146 ++ ...device_batchnorm_backward_f16_instance.cpp | 147 ++ ...device_batchnorm_backward_f32_instance.cpp | 145 ++ ...device_batchnorm_backward_f64_instance.cpp | 145 ++ ...device_batchnorm_forward_bf16_instance.cpp | 147 ++ .../device_batchnorm_forward_f16_instance.cpp | 147 ++ .../device_batchnorm_forward_f32_instance.cpp | 145 ++ .../device_batchnorm_forward_f64_instance.cpp | 145 ++ .../gpu/contraction_bilinear/CMakeLists.txt | 7 + ..._shuffle_f32_f32_f32_f32_kknn_instance.cpp | 79 + ..._shuffle_f32_f32_f32_f32_knnn_instance.cpp | 82 + ..._shuffle_f32_f32_f32_f32_mknn_instance.cpp | 82 + ..._shuffle_f32_f32_f32_f32_mnnn_instance.cpp | 82 + .../gpu/contraction_scale/CMakeLists.txt | 7 + ...xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp | 78 + ...xdl_c_shuffle_f32_f32_f32_knn_instance.cpp | 81 + ...xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp | 81 + ...xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp | 81 + .../gpu/conv1d_bwd_data/CMakeLists.txt | 6 + ...bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp | 102 + ..._bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp | 95 + ..._bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp | 94 + ...bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp | 99 + .../gpu/conv2d_bwd_data/CMakeLists.txt | 10 + ...wd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp | 83 + ...wd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp | 83 + 
...d_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp | 83 + ..._data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 157 ++ ...d_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 158 ++ ...d_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 155 ++ ..._data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 153 ++ .../gpu/conv2d_fwd/CMakeLists.txt | 7 + ..._c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp | 155 ++ ...d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp | 128 ++ ...2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp | 120 + ...2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp | 119 + ...d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp | 125 ++ .../gpu/conv2d_fwd_bias_relu/CMakeLists.txt | 3 + ..._bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp | 154 ++ .../conv2d_fwd_bias_relu_add/CMakeLists.txt | 4 + ...s_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp | 154 ++ .../gpu/conv3d_bwd_data/CMakeLists.txt | 6 + ...ta_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp | 102 + ...ata_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp | 102 + ...ata_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp | 101 + ...ta_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp | 99 + .../gpu/elementwise/CMakeLists.txt | 3 + .../elementwise/device_normalize_instance.cpp | 49 + .../elementwise_normalization/CMakeLists.txt | 3 + ...elementwise_normalization_f16_instance.cpp | 54 + .../gpu/gemm/CMakeLists.txt | 43 + ..._gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp | 52 + ..._gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp | 52 + ..._gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp | 52 + ..._gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp | 53 + ..._gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp | 52 + ..._gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp | 53 + ..._gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp | 53 + ..._gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp | 53 + ...ice_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp | 49 + ...ice_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp | 49 + ...ice_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp | 49 + ...ice_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp | 49 + ..._2_stage_f16_f16_f16_mk_nk_mn_instance.cpp | 100 + ...uffle_bf16_bf16_bf16_km_kn_mn_instance.cpp | 68 + ...uffle_bf16_bf16_bf16_km_nk_mn_instance.cpp | 68 + ...uffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp | 68 + ...uffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp | 65 + ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 111 + ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 111 + ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 111 + ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 102 + ..._shuffle_f32_f32_f32_km_kn_mn_instance.cpp | 67 + ..._shuffle_f32_f32_f32_km_nk_mn_instance.cpp | 67 + ..._shuffle_f32_f32_f32_mk_kn_mn_instance.cpp | 67 + ..._shuffle_f32_f32_f32_mk_nk_mn_instance.cpp | 64 + ...l_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp | 68 + ...l_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp | 68 + ...l_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp | 68 + ...l_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp | 65 + ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 110 + ...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 110 + ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 137 ++ ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 130 ++ ...gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp | 60 + ...gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp | 60 + ...gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp | 60 + ...gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp | 65 + ...gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp | 56 + ...gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp | 56 + ...gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp | 56 + ...gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp | 61 + .../gpu/gemm_add_add_fastgelu/CMakeLists.txt | 6 + 
...16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp | 153 ++ ...16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp | 153 ++ ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp | 153 ++ ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp | 144 ++ .../gpu/gemm_add_fastgelu/CMakeLists.txt | 6 + ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 136 ++ ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 136 ++ ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 136 ++ ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 127 ++ .../gpu/gemm_bias_add_reduce/CMakeLists.txt | 6 + ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 83 + ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 82 + ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 82 + ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 79 + .../gpu/gemm_bilinear/CMakeLists.txt | 6 + ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp | 105 + ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp | 105 + ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp | 105 + ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp | 142 ++ .../gpu/gemm_fastgelu/CMakeLists.txt | 6 + ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp | 135 ++ ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp | 135 ++ ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp | 135 ++ ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp | 126 ++ .../gpu/gemm_reduce/CMakeLists.txt | 6 + ..._f16_f16_f16_f32_f32_km_kn_mn_instance.cpp | 80 + ..._f16_f16_f16_f32_f32_km_nk_mn_instance.cpp | 80 + ..._f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp | 80 + ..._f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp | 77 + .../gpu/gemm_splitk/CMakeLists.txt | 10 + ...l_splitk_f16_f16_f16_km_kn_mn_instance.cpp | 61 + ...l_splitk_f16_f16_f16_km_nk_mn_instance.cpp | 61 + ...l_splitk_f16_f16_f16_mk_kn_mn_instance.cpp | 61 + ...l_splitk_f16_f16_f16_mk_nk_mn_instance.cpp | 66 + ...l_splitk_f32_f32_f32_km_kn_mn_instance.cpp | 61 + ...l_splitk_f32_f32_f32_km_nk_mn_instance.cpp | 61 + ...l_splitk_f32_f32_f32_mk_kn_mn_instance.cpp | 61 + ...l_splitk_f32_f32_f32_mk_nk_mn_instance.cpp | 66 + .../grouped_conv1d_bwd_weight/CMakeLists.txt | 5 + ...eight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 106 + ...weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 104 + ...weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 103 + .../gpu/grouped_conv1d_fwd/CMakeLists.txt | 6 + ...d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp | 129 ++ ...1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp | 129 ++ ...1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp | 128 ++ ...d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp | 125 ++ .../grouped_conv2d_bwd_data/CMakeLists.txt | 3 + ...ata_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 99 + .../grouped_conv2d_bwd_weight/CMakeLists.txt | 6 + ...ht_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 106 + ...ght_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 105 + ...ght_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 104 + .../gpu/grouped_conv2d_fwd/CMakeLists.txt | 13 + ..._fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 103 + ..._fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 107 + ...fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp | 104 + ...wd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp | 156 ++ ...fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp | 156 ++ ...fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp | 128 ++ ...wd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp | 125 ++ ...fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp | 156 ++ .../grouped_conv3d_bwd_weight/CMakeLists.txt | 5 + ...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 106 + ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 106 + ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 105 + .../gpu/grouped_conv3d_fwd/CMakeLists.txt | 6 + 
...xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp | 129 ++ ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp | 129 ++ ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp | 128 ++ ...xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp | 125 ++ .../gpu/grouped_gemm/CMakeLists.txt | 6 + ...gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp | 94 + ...gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp | 94 + ...gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp | 94 + ...gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp | 91 + .../gpu/normalization/CMakeLists.txt | 4 + .../device_normalization_f16_instance.cpp | 65 + .../device_normalization_f32_instance.cpp | 60 + .../gpu/quantization/CMakeLists.txt | 6 + ..._perchannel_quantization_int8_instance.cpp | 74 + ...as_perlayer_quantization_int8_instance.cpp | 68 + .../device_conv2d_xdl_int8_instance.hpp | 111 + ..._perchannel_quantization_int8_instance.cpp | 62 + ...dl_perlayer_quantization_int8_instance.cpp | 62 + .../gpu/reduce/CMakeLists.txt | 76 + ...uce_instance_blockwise_b16_f32_b16_add.cpp | 24 + ...ce_instance_blockwise_b16_f32_b16_amax.cpp | 28 + ...uce_instance_blockwise_b16_f32_b16_avg.cpp | 24 + ...uce_instance_blockwise_b16_f32_b16_max.cpp | 28 + ...uce_instance_blockwise_b16_f32_b16_min.cpp | 28 + ...e_instance_blockwise_b16_f32_b16_norm2.cpp | 24 + ...ce_instance_blockwise_f16_f16_f16_amax.cpp | 28 + ...uce_instance_blockwise_f16_f16_f16_max.cpp | 28 + ...uce_instance_blockwise_f16_f16_f16_min.cpp | 28 + ...uce_instance_blockwise_f16_f32_f16_add.cpp | 24 + ...uce_instance_blockwise_f16_f32_f16_avg.cpp | 24 + ...e_instance_blockwise_f16_f32_f16_norm2.cpp | 24 + ...uce_instance_blockwise_f32_f32_f32_add.cpp | 24 + ...ce_instance_blockwise_f32_f32_f32_amax.cpp | 28 + ...uce_instance_blockwise_f32_f32_f32_avg.cpp | 24 + ...uce_instance_blockwise_f32_f32_f32_max.cpp | 28 + ...uce_instance_blockwise_f32_f32_f32_min.cpp | 28 + ...e_instance_blockwise_f32_f32_f32_norm2.cpp | 25 + ...uce_instance_blockwise_f32_f64_f32_add.cpp | 23 + ...uce_instance_blockwise_f32_f64_f32_avg.cpp | 23 + ...e_instance_blockwise_f32_f64_f32_norm2.cpp | 23 + ...uce_instance_blockwise_f64_f64_f64_add.cpp | 24 + ...ce_instance_blockwise_f64_f64_f64_amax.cpp | 28 + ...uce_instance_blockwise_f64_f64_f64_avg.cpp | 24 + ...uce_instance_blockwise_f64_f64_f64_max.cpp | 28 + ...uce_instance_blockwise_f64_f64_f64_min.cpp | 28 + ...e_instance_blockwise_f64_f64_f64_norm2.cpp | 24 + ...educe_instance_blockwise_i8_i32_i8_add.cpp | 24 + ...educe_instance_blockwise_i8_i32_i8_avg.cpp | 24 + ...educe_instance_blockwise_i8_i8_i8_amax.cpp | 28 + ...reduce_instance_blockwise_i8_i8_i8_max.cpp | 28 + ...reduce_instance_blockwise_i8_i8_i8_min.cpp | 28 + ..._multiblock_atomic_add_b16_f32_f32_add.cpp | 23 + ..._multiblock_atomic_add_b16_f32_f32_avg.cpp | 23 + ..._multiblock_atomic_add_f16_f32_f32_add.cpp | 24 + ..._multiblock_atomic_add_f16_f32_f32_avg.cpp | 24 + ..._multiblock_atomic_add_f32_f32_f32_add.cpp | 23 + ..._multiblock_atomic_add_f32_f32_f32_avg.cpp | 23 + ..._multiblock_atomic_add_f32_f64_f32_add.cpp | 23 + ..._multiblock_atomic_add_f32_f64_f32_avg.cpp | 23 + ..._multiblock_atomic_add_f64_f64_f64_add.cpp | 24 + ..._multiblock_atomic_add_f64_f64_f64_avg.cpp | 24 + ...ce_instance_threadwise_b16_f32_b16_add.cpp | 24 + ...e_instance_threadwise_b16_f32_b16_amax.cpp | 28 + ...ce_instance_threadwise_b16_f32_b16_avg.cpp | 24 + ...ce_instance_threadwise_b16_f32_b16_max.cpp | 28 + ...ce_instance_threadwise_b16_f32_b16_min.cpp | 28 + ..._instance_threadwise_b16_f32_b16_norm2.cpp | 24 + ...e_instance_threadwise_f16_f16_f16_amax.cpp | 28 + 
...ce_instance_threadwise_f16_f16_f16_max.cpp | 28 + ...ce_instance_threadwise_f16_f16_f16_min.cpp | 28 + ...ce_instance_threadwise_f16_f32_f16_add.cpp | 23 + ...ce_instance_threadwise_f16_f32_f16_avg.cpp | 23 + ..._instance_threadwise_f16_f32_f16_norm2.cpp | 23 + ...ce_instance_threadwise_f32_f32_f32_add.cpp | 24 + ...e_instance_threadwise_f32_f32_f32_amax.cpp | 28 + ...ce_instance_threadwise_f32_f32_f32_avg.cpp | 24 + ...ce_instance_threadwise_f32_f32_f32_max.cpp | 28 + ...ce_instance_threadwise_f32_f32_f32_min.cpp | 28 + ..._instance_threadwise_f32_f32_f32_norm2.cpp | 24 + ...ce_instance_threadwise_f32_f64_f32_add.cpp | 24 + ...ce_instance_threadwise_f32_f64_f32_avg.cpp | 24 + ..._instance_threadwise_f32_f64_f32_norm2.cpp | 24 + ...ce_instance_threadwise_f64_f64_f64_add.cpp | 23 + ...e_instance_threadwise_f64_f64_f64_amax.cpp | 27 + ...ce_instance_threadwise_f64_f64_f64_avg.cpp | 23 + ...ce_instance_threadwise_f64_f64_f64_max.cpp | 27 + ...ce_instance_threadwise_f64_f64_f64_min.cpp | 27 + ..._instance_threadwise_f64_f64_f64_norm2.cpp | 23 + ...duce_instance_threadwise_i8_i32_i8_add.cpp | 25 + ...duce_instance_threadwise_i8_i32_i8_avg.cpp | 24 + ...duce_instance_threadwise_i8_i8_i8_amax.cpp | 28 + ...educe_instance_threadwise_i8_i8_i8_max.cpp | 28 + ...educe_instance_threadwise_i8_i8_i8_min.cpp | 28 + .../gpu/softmax/CMakeLists.txt | 26 + .../device_softmax_f16_f16_instance.cpp | 40 + ...softmax_f16_f16_instance_rank3_reduce1.cpp | 27 + ...softmax_f16_f16_instance_rank3_reduce2.cpp | 27 + ...softmax_f16_f16_instance_rank3_reduce3.cpp | 27 + ...softmax_f16_f16_instance_rank4_reduce1.cpp | 27 + ...softmax_f16_f16_instance_rank4_reduce2.cpp | 27 + ...softmax_f16_f16_instance_rank4_reduce3.cpp | 27 + ...softmax_f16_f16_instance_rank4_reduce4.cpp | 27 + .../device_softmax_f32_f32_instance.cpp | 40 + ...softmax_f32_f32_instance_rank3_reduce1.cpp | 27 + ...softmax_f32_f32_instance_rank3_reduce2.cpp | 27 + ...softmax_f32_f32_instance_rank3_reduce3.cpp | 27 + ...softmax_f32_f32_instance_rank4_reduce1.cpp | 27 + ...softmax_f32_f32_instance_rank4_reduce2.cpp | 27 + ...softmax_f32_f32_instance_rank4_reduce3.cpp | 27 + ...softmax_f32_f32_instance_rank4_reduce4.cpp | 27 + .../softmax/device_softmax_i8_i8_instance.cpp | 40 + ...e_softmax_i8_i8_instance_rank3_reduce1.cpp | 27 + ...e_softmax_i8_i8_instance_rank3_reduce2.cpp | 27 + ...e_softmax_i8_i8_instance_rank3_reduce3.cpp | 27 + ...e_softmax_i8_i8_instance_rank4_reduce1.cpp | 27 + ...e_softmax_i8_i8_instance_rank4_reduce2.cpp | 27 + ...e_softmax_i8_i8_instance_rank4_reduce3.cpp | 27 + ...e_softmax_i8_i8_instance_rank4_reduce4.cpp | 27 + library/src/utility/CMakeLists.txt | 28 + library/src/utility/convolution_parameter.cpp | 171 ++ library/src/utility/device_memory.cpp | 29 + library/src/utility/host_tensor.cpp | 59 + profiler/CMakeLists.txt | 5 + profiler/README.md | 48 + profiler/include/profiler/data_type_enum.hpp | 20 + .../profiler/data_type_enum_helper.hpp | 77 + ...le_batched_gemm_add_relu_gemm_add_impl.hpp | 360 +++ .../profile_batched_gemm_gemm_impl.hpp | 319 +++ .../profiler/profile_batched_gemm_impl.hpp | 233 ++ .../profile_batched_gemm_reduce_impl.hpp | 362 +++ ...profile_batched_gemm_softmax_gemm_impl.hpp | 347 +++ ...batched_gemm_softmax_gemm_permute_impl.hpp | 367 ++++ .../profile_batchnorm_backward_impl.hpp | 390 ++++ .../profile_batchnorm_forward_impl.hpp | 412 ++++ .../profiler/profile_conv_bwd_data_impl.hpp | 248 +++ .../profile_conv_fwd_bias_relu_add_impl.hpp | 278 +++ .../profile_conv_fwd_bias_relu_impl.hpp | 266 +++ 
.../profiler/profile_conv_fwd_impl.hpp | 221 ++ .../profiler/profile_convnd_bwd_data_impl.hpp | 486 ++++ .../profile_convnd_bwd_weight_impl.hpp | 474 ++++ .../profile_elementwise_layernorm_impl.hpp | 266 +++ .../profile_gemm_add_add_fastgelu_impl.hpp | 242 ++ .../profile_gemm_add_fastgelu_impl.hpp | 232 ++ .../profile_gemm_bias_add_reduce_impl.hpp | 384 ++++ .../profiler/profile_gemm_bilinear_impl.hpp | 234 ++ .../profiler/profile_gemm_fastgelu_impl.hpp | 222 ++ .../include/profiler/profile_gemm_impl.hpp | 254 +++ .../profiler/profile_gemm_reduce_impl.hpp | 353 +++ .../profiler/profile_gemm_splitk_impl.hpp | 257 +++ .../profile_grouped_conv_bwd_weight_impl.hpp | 252 +++ .../profile_grouped_conv_fwd_impl.hpp | 253 +++ .../profiler/profile_grouped_gemm_impl.hpp | 291 +++ .../profiler/profile_groupnorm_impl.hpp | 208 ++ .../profiler/profile_layernorm_impl.hpp | 227 ++ .../include/profiler/profile_reduce_impl.hpp | 520 +++++ .../include/profiler/profile_softmax_impl.hpp | 219 ++ profiler/src/CMakeLists.txt | 67 + profiler/src/profile_batched_gemm.cpp | 203 ++ ...profile_batched_gemm_add_relu_gemm_add.cpp | 214 ++ profiler/src/profile_batched_gemm_gemm.cpp | 187 ++ profiler/src/profile_batched_gemm_reduce.cpp | 159 ++ profiler/src/profile_batchnorm_bwd.cpp | 207 ++ profiler/src/profile_batchnorm_fwd.cpp | 219 ++ profiler/src/profile_conv_bwd_data.cpp | 190 ++ profiler/src/profile_conv_fwd.cpp | 192 ++ profiler/src/profile_conv_fwd_bias_relu.cpp | 122 + .../src/profile_conv_fwd_bias_relu_add.cpp | 122 + profiler/src/profile_gemm.cpp | 192 ++ .../src/profile_gemm_add_add_fastgelu.cpp | 158 ++ profiler/src/profile_gemm_add_fastgelu.cpp | 146 ++ profiler/src/profile_gemm_bias_add_reduce.cpp | 167 ++ profiler/src/profile_gemm_bilinear.cpp | 152 ++ profiler/src/profile_gemm_fastgelu.cpp | 137 ++ profiler/src/profile_gemm_reduce.cpp | 154 ++ profiler/src/profile_gemm_splitk.cpp | 154 ++ .../src/profile_grouped_conv_bwd_weight.cpp | 182 ++ profiler/src/profile_grouped_conv_fwd.cpp | 260 +++ profiler/src/profile_grouped_gemm.cpp | 169 ++ profiler/src/profile_groupnorm.cpp | 112 + profiler/src/profile_layernorm.cpp | 101 + profiler/src/profile_reduce.cpp | 434 ++++ profiler/src/profile_softmax.cpp | 169 ++ profiler/src/profiler.cpp | 30 + profiler/src/profiler_operation_registry.hpp | 79 + rbuild.ini | 8 + requirements.txt | 1 + script/clang-format-overwrite.sh | 2 + script/cmake-ck-dev.sh | 19 + script/cmake-ck-release.sh | 19 + script/count_vgpr.sh | 20 + script/hipclang_opt.sh | 25 + script/parse_perf_data.py | 290 +++ script/process_perf_data.py | 308 +++ script/process_perf_data.sh | 15 + script/process_qa_data.sh | 23 + script/profile_batched_gemm.sh | 44 + script/profile_conv_bwd_data.sh | 38 + script/profile_conv_fwd.sh | 38 + script/profile_gemm.sh | 58 + script/profile_gemm_bilinear.sh | 41 + script/profile_grouped_gemm.sh | 18 + script/profile_onnx_gemm.sh | 31 + script/profile_reduce_no_index.sh | 78 + script/profile_reduce_with_index.sh | 70 + script/profile_resnet50.sh | 69 + script/profile_splitK_gemm.sh | 41 + script/run_full_performance_tests.sh | 149 ++ script/run_performance_tests.sh | 60 + script/test_convnd_fwd.sh | 110 + script/test_reduce_no_index.sh | 63 + script/test_reduce_with_index.sh | 63 + test/CMakeLists.txt | 60 + test/batched_gemm/CMakeLists.txt | 15 + test/batched_gemm/batched_gemm_bf16.cpp | 44 + test/batched_gemm/batched_gemm_fp16.cpp | 44 + test/batched_gemm/batched_gemm_fp32.cpp | 44 + test/batched_gemm/batched_gemm_int8.cpp | 44 + 
test/batched_gemm_gemm/CMakeLists.txt | 5 + .../test_batched_gemm_gemm_fp16.cpp | 149 ++ .../test_batched_gemm_gemm_util.hpp | 189 ++ test/batched_gemm_reduce/CMakeLists.txt | 3 + .../batched_gemm_reduce_fp16.cpp | 67 + test/batched_gemm_softmax_gemm/CMakeLists.txt | 5 + .../test_batched_gemm_softmax_gemm_fp16.cpp | 176 ++ .../test_batched_gemm_softmax_gemm_util.hpp | 197 ++ .../CMakeLists.txt | 8 + ...batched_gemm_softmax_gemm_permute_bf16.cpp | 182 ++ ...batched_gemm_softmax_gemm_permute_fp16.cpp | 182 ++ ...batched_gemm_softmax_gemm_permute_util.hpp | 368 ++++ test/batchnorm/CMakeLists.txt | 4 + test/batchnorm/batchnorm_bwd_rank_4.cpp | 92 + test/batchnorm/batchnorm_fwd_rank_4.cpp | 109 + test/block_to_ctile_map/CMakeLists.txt | 1 + .../test_block_to_ctile_map.cpp | 322 +++ test/conv_util/CMakeLists.txt | 2 + test/conv_util/conv_util.cpp | 156 ++ test/convnd_bwd_data/CMakeLists.txt | 2 + test/convnd_bwd_data/convnd_bwd_data.cpp | 93 + test/convnd_fwd/CMakeLists.txt | 2 + test/convnd_fwd/convnd_fwd.cpp | 92 + test/data_type/CMakeLists.txt | 4 + test/data_type/int4.cpp | 211 ++ test/elementwise_normalization/CMakeLists.txt | 7 + .../test_elementwise_layernorm_fp16.cpp | 47 + test/gemm/CMakeLists.txt | 25 + test/gemm/gemm_bf16.cpp | 34 + test/gemm/gemm_fp16.cpp | 34 + test/gemm/gemm_fp32.cpp | 34 + test/gemm/gemm_fp64.cpp | 34 + test/gemm/gemm_int8.cpp | 34 + test/gemm/gemm_standalone_xdl_fp16.cpp | 162 ++ test/gemm/gemm_util.hpp | 267 +++ test/gemm/instance/gemm_f16_nn_instance.cpp | 86 + test/gemm/instance/gemm_f16_nn_instance.hpp | 41 + test/gemm/instance/gemm_f16_nt_instance.cpp | 86 + test/gemm/instance/gemm_f16_nt_instance.hpp | 41 + test/gemm/instance/gemm_f16_tn_instance.cpp | 86 + test/gemm/instance/gemm_f16_tn_instance.hpp | 41 + test/gemm/instance/gemm_f16_tt_instance.cpp | 86 + test/gemm/instance/gemm_f16_tt_instance.hpp | 41 + test/gemm/run_gemm_test.inc | 41 + test/gemm_reduce/CMakeLists.txt | 3 + test/gemm_reduce/gemm_reduce_fp16.cpp | 49 + test/gemm_split_k/CMakeLists.txt | 3 + test/gemm_split_k/gemm_split_k.cpp | 261 +++ test/grouped_convnd_bwd_weight/CMakeLists.txt | 2 + .../grouped_convnd_bwd_weight.cpp | 91 + test/grouped_convnd_fwd/CMakeLists.txt | 3 + .../grouped_convnd_fwd/grouped_convnd_fwd.cpp | 270 +++ test/grouped_gemm/CMakeLists.txt | 3 + test/grouped_gemm/grouped_gemm_fp16.cpp | 65 + test/magic_number_division/CMakeLists.txt | 2 + .../magic_number_division.cpp | 150 ++ test/normalization/CMakeLists.txt | 17 + test/normalization/test_groupnorm_fp16.cpp | 51 + test/normalization/test_groupnorm_fp32.cpp | 49 + test/normalization/test_layernorm2d_fp16.cpp | 45 + test/normalization/test_layernorm2d_fp32.cpp | 45 + test/reduce/CMakeLists.txt | 7 + test/reduce/reduce_no_index.cpp | 248 +++ test/reduce/reduce_with_index.cpp | 248 +++ test/reference_conv_fwd/CMakeLists.txt | 2 + .../reference_conv_fwd/reference_conv_fwd.cpp | 392 ++++ test/softmax/CMakeLists.txt | 11 + test/softmax/test_softmax_interface.cpp | 86 + test/softmax/test_softmax_rank3.cpp | 34 + test/softmax/test_softmax_rank4.cpp | 34 + test/softmax/test_softmax_ut_cases.inc | 60 + test/softmax/test_softmax_util.hpp | 142 ++ test/space_filling_curve/CMakeLists.txt | 1 + .../space_filling_curve.cpp | 195 ++ test/wmma_op/CMakeLists.txt | 2 + test/wmma_op/wmma_op.cpp | 67 + test/wmma_op/wmma_op_util.hpp | 369 ++++ 1216 files changed, 196847 insertions(+), 1 deletion(-) create mode 100644 CITATION.cff create mode 100644 CMakeLists.txt create mode 100644 CONTRIBUTORS.md create mode 100644 Config.cmake.in create 
mode 100644 Dockerfile create mode 100644 Jenkinsfile create mode 100644 LICENSE create mode 100644 client_example/01_gemm/CMakeLists.txt create mode 100644 client_example/01_gemm/gemm.cpp create mode 100644 client_example/02_gemm_add_add_fastgelu/CMakeLists.txt create mode 100644 client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp create mode 100644 client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp create mode 100644 client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp create mode 100644 client_example/03_gemm_layernorm/CMakeLists.txt create mode 100644 client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp create mode 100644 client_example/04_contraction/CMakeLists.txt create mode 100644 client_example/04_contraction/contraction_bilinear.cpp create mode 100644 client_example/04_contraction/contraction_scale.cpp create mode 100644 client_example/05_layernorm/CMakeLists.txt create mode 100644 client_example/05_layernorm/layernorm2d.cpp create mode 100644 client_example/06_softmax/CMakeLists.txt create mode 100644 client_example/06_softmax/softmax4d.cpp create mode 100644 client_example/07_grouped_conv2d_fwd/CMakeLists.txt create mode 100644 client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp create mode 100644 client_example/08_fused_attention/CMakeLists.txt create mode 100644 client_example/08_fused_attention/fused_attention.cpp create mode 100644 client_example/09_quantization/CMakeLists.txt create mode 100644 client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp create mode 100644 client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp create mode 100644 client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp create mode 100644 client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp create mode 100644 client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt create mode 100644 client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp create mode 100644 client_example/11_grouped_conv_bwd_weight/CMakeLists.txt create mode 100644 client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp create mode 100644 client_example/12_elementwise_normalization/CMakeLists.txt create mode 100644 client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp create mode 100644 client_example/13_batchnorm/CMakeLists.txt create mode 100644 client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp create mode 100644 client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp create mode 100644 client_example/14_instance_id/CMakeLists.txt create mode 100644 client_example/14_instance_id/batchnorm_fwd_instance_id.cpp create mode 100644 client_example/CMakeLists.txt create mode 100644 client_example/README.md create mode 100644 cmake/Analyzers.cmake create mode 100644 cmake/ClangTidy.cmake create mode 100644 cmake/CppCheck.cmake create mode 100644 cmake/DoxygenDoc.cmake create mode 100644 cmake/EnableCompilerWarnings.cmake create mode 100644 cmake/TargetFlags.cmake create mode 100644 cmake/googletest.cmake create mode 100644 dev-requirements.txt create mode 100644 doc/image/ck_component.png create mode 100644 doc/image/ck_layer.png create mode 100644 doc/markdown/dockerhub.md create mode 100644 example/01_gemm/CMakeLists.txt create mode 100644 example/01_gemm/README.md create mode 100644 example/01_gemm/common.hpp create mode 100644 example/01_gemm/gemm_dl_fp16.cpp create mode 100644 example/01_gemm/gemm_dl_fp32.cpp create mode 100644 example/01_gemm/gemm_dl_int4.cpp create 
mode 100644 example/01_gemm/gemm_dl_int8.cpp create mode 100644 example/01_gemm/gemm_xdl_bf16.cpp create mode 100644 example/01_gemm/gemm_xdl_fp16.cpp create mode 100644 example/01_gemm/gemm_xdl_fp64.cpp create mode 100644 example/01_gemm/gemm_xdl_int4.cpp create mode 100644 example/01_gemm/gemm_xdl_int8.cpp create mode 100644 example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp create mode 100644 example/01_gemm/run_gemm_example.inc create mode 100644 example/02_gemm_bilinear/CMakeLists.txt create mode 100644 example/02_gemm_bilinear/README.md create mode 100644 example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp create mode 100644 example/03_gemm_bias_relu/CMakeLists.txt create mode 100644 example/03_gemm_bias_relu/README.md create mode 100644 example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp create mode 100644 example/04_gemm_add_add_fastgelu/CMakeLists.txt create mode 100644 example/04_gemm_add_add_fastgelu/README.md create mode 100644 example/04_gemm_add_add_fastgelu/common.hpp create mode 100644 example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp create mode 100644 example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp create mode 100644 example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp create mode 100644 example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp create mode 100644 example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp create mode 100644 example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc create mode 100644 example/09_convnd_fwd/CMakeLists.txt create mode 100644 example/09_convnd_fwd/README.md create mode 100644 example/09_convnd_fwd/convnd_fwd_common.hpp create mode 100644 example/09_convnd_fwd/convnd_fwd_dl_common.hpp create mode 100644 example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_dl_int8.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp create mode 100644 example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp create mode 100644 example/09_convnd_fwd/run_convnd_fwd_dl_example.inc create mode 100644 example/09_convnd_fwd/run_convnd_fwd_example.inc create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp create mode 100644 example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc create mode 100644 example/12_reduce/CMakeLists.txt create mode 100644 example/12_reduce/README.md create mode 100644 example/12_reduce/reduce_blockwise.cpp create mode 100644 example/12_reduce/reduce_blockwise_impl.hpp create mode 100644 example/12_reduce/reduce_blockwise_two_call.cpp create mode 100644 example/12_reduce/reduce_example_common.hpp create mode 100644 
example/12_reduce/reduce_multiblock_atomic_add.cpp create mode 100644 example/12_reduce/reduce_multiblock_atomic_add_impl.hpp create mode 100644 example/13_pool2d_fwd/CMakeLists.txt create mode 100644 example/13_pool2d_fwd/README.md create mode 100644 example/13_pool2d_fwd/pool2d_fwd_common.hpp create mode 100644 example/13_pool2d_fwd/pool2d_fwd_fp16.cpp create mode 100644 example/13_pool2d_fwd/pool2d_fwd_fp32.cpp create mode 100644 example/14_gemm_quantization/CMakeLists.txt create mode 100644 example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp create mode 100644 example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp create mode 100644 example/15_grouped_gemm/CMakeLists.txt create mode 100644 example/15_grouped_gemm/README.md create mode 100644 example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp create mode 100644 example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp create mode 100644 example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp create mode 100644 example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp create mode 100644 example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp create mode 100644 example/15_grouped_gemm/run_grouped_gemm_example.inc create mode 100644 example/16_gemm_multi_d_multi_reduces/CMakeLists.txt create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp create mode 100644 example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp create mode 100644 example/17_convnd_bwd_data/CMakeLists.txt create mode 100644 example/17_convnd_bwd_data/README.md create mode 100644 example/17_convnd_bwd_data/convnd_bwd_data_common.hpp create mode 100644 example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp create mode 100644 example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp create mode 100644 example/18_batched_gemm_reduce/CMakeLists.txt create mode 100644 example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp create mode 100644 example/19_binary_elementwise/CMakeLists.txt create mode 100644 example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp create mode 100644 example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp create mode 100644 example/19_binary_elementwise/elementwise_add_1d.cpp create mode 100644 example/19_binary_elementwise/elementwise_add_4d.cpp create mode 100644 example/20_grouped_conv_bwd_weight/CMakeLists.txt create mode 100644 example/20_grouped_conv_bwd_weight/common.hpp create mode 100644 example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp create mode 100644 example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp create mode 100644 example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc create mode 100644 example/21_gemm_layernorm/CMakeLists.txt create mode 100644 
example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp create mode 100644 example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp create mode 100644 example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp create mode 100644 example/22_cgemm/CMakeLists.txt create mode 100644 example/22_cgemm/cgemm_xdl_bf16.cpp create mode 100644 example/22_cgemm/cgemm_xdl_common.hpp create mode 100644 example/22_cgemm/cgemm_xdl_fp16.cpp create mode 100644 example/22_cgemm/cgemm_xdl_fp32.cpp create mode 100644 example/22_cgemm/cgemm_xdl_int4.cpp create mode 100644 example/22_cgemm/cgemm_xdl_int8.cpp create mode 100644 example/23_softmax/CMakeLists.txt create mode 100644 example/23_softmax/README.md create mode 100644 example/23_softmax/softmax_blockwise.cpp create mode 100644 example/24_batched_gemm/CMakeLists.txt create mode 100644 example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp create mode 100644 example/24_batched_gemm/batched_gemm_xdl_fp16.cpp create mode 100644 example/24_batched_gemm/batched_gemm_xdl_fp32.cpp create mode 100644 example/24_batched_gemm/batched_gemm_xdl_int4.cpp create mode 100644 example/24_batched_gemm/batched_gemm_xdl_int8.cpp create mode 100644 example/24_batched_gemm/run_batched_gemm_example.inc create mode 100644 example/25_gemm_bias_e_permute/CMakeLists.txt create mode 100644 example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp create mode 100644 example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp create mode 100644 example/26_contraction/CMakeLists.txt create mode 100644 example/26_contraction/README.md create mode 100644 example/26_contraction/contraction_bilinear_xdl_fp32.cpp create mode 100644 example/26_contraction/contraction_scale_xdl_fp32.cpp create mode 100644 example/27_layernorm/CMakeLists.txt create mode 100644 example/27_layernorm/layernorm_blockwise.cpp create mode 100644 example/28_grouped_gemm_bias_e_permute/CMakeLists.txt create mode 100644 example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp create mode 100644 example/29_batched_gemm_bias_e_permute/CMakeLists.txt create mode 100644 example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt create mode 100644 example/30_grouped_conv_fwd_multiple_d/README.md create mode 100644 example/30_grouped_conv_fwd_multiple_d/common.hpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc create mode 100644 example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc create mode 100644 example/31_batched_gemm_gemm/CMakeLists.txt create mode 100644 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp create mode 100644 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp create mode 100644 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp create mode 100644 
example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp create mode 100644 example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp create mode 100644 example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc create mode 100644 example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc create mode 100644 example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc create mode 100644 example/33_multiple_reduce/CMakeLists.txt create mode 100644 example/33_multiple_reduce/README.md create mode 100644 example/33_multiple_reduce/dual_reduce_common.hpp create mode 100644 example/33_multiple_reduce/dual_reduce_multiblock.cpp create mode 100644 example/33_multiple_reduce/dual_reduce_threadwise.cpp create mode 100644 example/34_batchnorm/CMakeLists.txt create mode 100644 example/34_batchnorm/README.md create mode 100644 example/34_batchnorm/batchnorm_backward_nhwc.cpp create mode 100644 example/34_batchnorm/batchnorm_common.hpp create mode 100644 example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp create mode 100644 example/34_batchnorm/batchnorm_forward_training_nhwc.cpp create mode 100644 example/34_batchnorm/batchnorm_infer_impl.hpp create mode 100644 example/35_splitK_gemm/CMakeLists.txt create mode 100644 example/35_splitK_gemm/run_splitK_gemm_example.inc create mode 100644 example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp create mode 100644 example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp create mode 100644 example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp create mode 100644 example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp create mode 100644 example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp create mode 100644 example/36_sparse_embedding/CMakeLists.txt create mode 100644 example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp create mode 100644 example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt create mode 100644 example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp create mode 100644 example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt create mode 100644 example/38_grouped_conv_bwd_data_multiple_d/common.hpp create mode 100644 example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp create mode 100644 example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp create mode 100644 example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc create mode 100644 
example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc create mode 100644 example/39_permute/CMakeLists.txt create mode 100644 example/39_permute/common.hpp create mode 100644 example/39_permute/permute_1xHxW_fp16.cpp create mode 100644 example/39_permute/permute_HxWx4_fp16.cpp create mode 100644 example/39_permute/permute_NxHxW_fp16.cpp create mode 100644 example/39_permute/run_permute_bundle_example.inc create mode 100644 example/39_permute/run_permute_element_example.inc create mode 100644 example/41_grouped_conv_conv_fwd/CMakeLists.txt create mode 100644 example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp create mode 100644 example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp create mode 100644 example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp create mode 100644 example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp create mode 100644 example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp create mode 100644 example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc create mode 100644 example/42_groupnorm/CMakeLists.txt create mode 100644 example/42_groupnorm/groupnorm_sigmoid_fp16.cpp create mode 100644 example/43_splitk_gemm_bias_e_permute/CMakeLists.txt create mode 100644 example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp create mode 100644 example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp create mode 100644 example/44_conv2d_fwd_quantization/CMakeLists.txt create mode 100644 example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp create mode 100644 example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp create mode 100644 example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp create mode 100644 example/44_elementwise_permute/CMakeLists.txt create mode 100644 example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp create mode 100644 example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp create mode 100644 example/45_elementwise_normalization/CMakeLists.txt create mode 100644 example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp create mode 100644 example/CMakeLists.txt create mode 100644 include/ck/ck.hpp create mode 100644 include/ck/host_utility/device_prop.hpp create mode 100644 include/ck/host_utility/hip_check_error.hpp create mode 100644 include/ck/host_utility/io.hpp create mode 100644 include/ck/host_utility/kernel_launch.hpp create mode 100644 include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp create mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp create mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp create mode 100644 
include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp create mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp create mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp create mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp create mode 100644 include/ck/stream_config.hpp create mode 100644 include/ck/tensor/static_tensor.hpp create mode 100644 include/ck/tensor_description/cluster_descriptor.hpp create mode 100644 include/ck/tensor_description/multi_index_transform.hpp create mode 100644 include/ck/tensor_description/multi_index_transform_helper.hpp create mode 100644 include/ck/tensor_description/tensor_adaptor.hpp create mode 100644 include/ck/tensor_description/tensor_descriptor.hpp create mode 100644 include/ck/tensor_description/tensor_descriptor_helper.hpp create mode 100644 include/ck/tensor_description/tensor_space_filling_curve.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp create mode 100644 include/ck/tensor_operation/gpu/block/blockwise_welford.hpp create mode 100644 include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp create mode 100644 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp create mode 100644 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp create mode 100644 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp create mode 100644 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp create mode 100644 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp create mode 100644 include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_base.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp create mode 100644 
include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_cgemm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_normalization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_reduce.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_softmax.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp create mode 100644 
include/ck/tensor_operation/gpu/device/gemm_specialization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp create mode 100644 
include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp create mode 100644 include/ck/tensor_operation/gpu/device/masking_specialization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/matrix_padder.hpp create mode 100644 include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp create mode 100644 include/ck/tensor_operation/gpu/device/tensor_layout.hpp create mode 100644 include/ck/tensor_operation/gpu/device/tensor_specialization.hpp create mode 100644 include/ck/tensor_operation/gpu/device/welford_helper.hpp create mode 100644 include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp create mode 100644 include/ck/tensor_operation/gpu/element/element_wise_operation.hpp create mode 100644 include/ck/tensor_operation/gpu/element/quantization_operation.hpp create mode 100644 include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp create mode 100644 
include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp create mode 100644 
include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp create mode 100644 include/ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp create mode 100644 include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp create mode 100644 include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp create mode 100644 include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp create mode 100644 include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp create mode 100644 include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp create mode 100644 include/ck/utility/amd_address_space.hpp create mode 100644 include/ck/utility/amd_buffer_addressing.hpp create mode 100644 include/ck/utility/amd_inline_asm.hpp create mode 100644 include/ck/utility/amd_llvm_intrinsic.hpp create mode 100644 include/ck/utility/amd_wmma.hpp create mode 100644 include/ck/utility/amd_xdlops.hpp create mode 100644 include/ck/utility/array.hpp create mode 100644 include/ck/utility/array_multi_index.hpp create mode 100644 include/ck/utility/c_style_pointer_cast.hpp create mode 100644 include/ck/utility/common_header.hpp create mode 100644 include/ck/utility/container_element_picker.hpp create mode 100644 include/ck/utility/container_helper.hpp create mode 100644 include/ck/utility/data_type.hpp create mode 100644 include/ck/utility/debug.hpp create mode 100644 include/ck/utility/dynamic_buffer.hpp create mode 100644 include/ck/utility/enable_if.hpp create mode 100644 include/ck/utility/functional.hpp create mode 100644 include/ck/utility/functional2.hpp create mode 100644 include/ck/utility/functional3.hpp create mode 100644 include/ck/utility/functional4.hpp create mode 100644 include/ck/utility/generic_memory_space_atomic.hpp create mode 100644 
include/ck/utility/get_id.hpp create mode 100644 include/ck/utility/ignore.hpp create mode 100644 include/ck/utility/inner_product.hpp create mode 100644 include/ck/utility/integral_constant.hpp create mode 100644 include/ck/utility/is_known_at_compile_time.hpp create mode 100644 include/ck/utility/magic_division.hpp create mode 100644 include/ck/utility/math.hpp create mode 100644 include/ck/utility/math_v2.hpp create mode 100644 include/ck/utility/multi_index.hpp create mode 100644 include/ck/utility/number.hpp create mode 100644 include/ck/utility/print.hpp create mode 100644 include/ck/utility/reduction_common.hpp create mode 100644 include/ck/utility/reduction_enums.hpp create mode 100644 include/ck/utility/reduction_functions_accumulate.hpp create mode 100644 include/ck/utility/reduction_operator.hpp create mode 100644 include/ck/utility/sequence.hpp create mode 100644 include/ck/utility/sequence_helper.hpp create mode 100644 include/ck/utility/span.hpp create mode 100644 include/ck/utility/static_buffer.hpp create mode 100644 include/ck/utility/statically_indexed_array.hpp create mode 100644 include/ck/utility/statically_indexed_array_multi_index.hpp create mode 100644 include/ck/utility/synchronization.hpp create mode 100644 include/ck/utility/thread_group.hpp create mode 100644 include/ck/utility/transpose_vectors.hpp create mode 100644 include/ck/utility/tuple.hpp create mode 100644 include/ck/utility/tuple_helper.hpp create mode 100644 include/ck/utility/type.hpp create mode 100644 library/CMakeLists.txt create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp create mode 100644 
library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp create mode 100644 library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp 
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp 
create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp create mode 100644 library/include/ck/library/utility/algorithm.hpp create mode 100644 library/include/ck/library/utility/check_err.hpp create mode 100644 library/include/ck/library/utility/conv_common.hpp create mode 100644 library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp create mode 100644 library/include/ck/library/utility/convolution_parameter.hpp create mode 100644 library/include/ck/library/utility/device_memory.hpp create mode 100644 library/include/ck/library/utility/fill.hpp create mode 100644 library/include/ck/library/utility/host_common_util.hpp create mode 100644 library/include/ck/library/utility/host_conv.hpp create mode 100644 library/include/ck/library/utility/host_gemm.hpp create mode 100644 library/include/ck/library/utility/host_reduction.hpp create mode 100644 library/include/ck/library/utility/host_tensor.hpp create mode 100644 library/include/ck/library/utility/host_tensor_generator.hpp create mode 100644 library/include/ck/library/utility/iterator.hpp create mode 100644 library/include/ck/library/utility/literals.hpp create mode 100644 library/include/ck/library/utility/numeric.hpp create mode 100644 library/include/ck/library/utility/op_instance_engine.hpp create mode 100644 library/include/ck/library/utility/ranges.hpp create mode 100644 library/src/tensor_operation_instance/gpu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp create mode 
100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp 
create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_int8_instance.hpp create mode 100644 library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp create mode 
100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp create mode 100644 library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp create mode 100644 library/src/utility/CMakeLists.txt create mode 100644 library/src/utility/convolution_parameter.cpp create mode 100644 library/src/utility/device_memory.cpp create mode 100644 
library/src/utility/host_tensor.cpp create mode 100644 profiler/CMakeLists.txt create mode 100644 profiler/README.md create mode 100644 profiler/include/profiler/data_type_enum.hpp create mode 100644 profiler/include/profiler/data_type_enum_helper.hpp create mode 100644 profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp create mode 100644 profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp create mode 100644 profiler/include/profiler/profile_batched_gemm_impl.hpp create mode 100644 profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp create mode 100644 profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp create mode 100644 profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp create mode 100644 profiler/include/profiler/profile_batchnorm_backward_impl.hpp create mode 100644 profiler/include/profiler/profile_batchnorm_forward_impl.hpp create mode 100644 profiler/include/profiler/profile_conv_bwd_data_impl.hpp create mode 100644 profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp create mode 100644 profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp create mode 100644 profiler/include/profiler/profile_conv_fwd_impl.hpp create mode 100644 profiler/include/profiler/profile_convnd_bwd_data_impl.hpp create mode 100644 profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp create mode 100644 profiler/include/profiler/profile_elementwise_layernorm_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_bilinear_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_fastgelu_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_reduce_impl.hpp create mode 100644 profiler/include/profiler/profile_gemm_splitk_impl.hpp create mode 100644 profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp create mode 100644 profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp create mode 100644 profiler/include/profiler/profile_grouped_gemm_impl.hpp create mode 100644 profiler/include/profiler/profile_groupnorm_impl.hpp create mode 100644 profiler/include/profiler/profile_layernorm_impl.hpp create mode 100644 profiler/include/profiler/profile_reduce_impl.hpp create mode 100644 profiler/include/profiler/profile_softmax_impl.hpp create mode 100644 profiler/src/CMakeLists.txt create mode 100644 profiler/src/profile_batched_gemm.cpp create mode 100644 profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp create mode 100644 profiler/src/profile_batched_gemm_gemm.cpp create mode 100644 profiler/src/profile_batched_gemm_reduce.cpp create mode 100644 profiler/src/profile_batchnorm_bwd.cpp create mode 100644 profiler/src/profile_batchnorm_fwd.cpp create mode 100644 profiler/src/profile_conv_bwd_data.cpp create mode 100644 profiler/src/profile_conv_fwd.cpp create mode 100644 profiler/src/profile_conv_fwd_bias_relu.cpp create mode 100644 profiler/src/profile_conv_fwd_bias_relu_add.cpp create mode 100644 profiler/src/profile_gemm.cpp create mode 100644 profiler/src/profile_gemm_add_add_fastgelu.cpp create mode 100644 profiler/src/profile_gemm_add_fastgelu.cpp create mode 100644 profiler/src/profile_gemm_bias_add_reduce.cpp create mode 100644 
profiler/src/profile_gemm_bilinear.cpp create mode 100644 profiler/src/profile_gemm_fastgelu.cpp create mode 100644 profiler/src/profile_gemm_reduce.cpp create mode 100644 profiler/src/profile_gemm_splitk.cpp create mode 100644 profiler/src/profile_grouped_conv_bwd_weight.cpp create mode 100644 profiler/src/profile_grouped_conv_fwd.cpp create mode 100644 profiler/src/profile_grouped_gemm.cpp create mode 100644 profiler/src/profile_groupnorm.cpp create mode 100644 profiler/src/profile_layernorm.cpp create mode 100644 profiler/src/profile_reduce.cpp create mode 100644 profiler/src/profile_softmax.cpp create mode 100644 profiler/src/profiler.cpp create mode 100644 profiler/src/profiler_operation_registry.hpp create mode 100644 rbuild.ini create mode 100644 requirements.txt create mode 100644 script/clang-format-overwrite.sh create mode 100644 script/cmake-ck-dev.sh create mode 100644 script/cmake-ck-release.sh create mode 100644 script/count_vgpr.sh create mode 100644 script/hipclang_opt.sh create mode 100644 script/parse_perf_data.py create mode 100644 script/process_perf_data.py create mode 100644 script/process_perf_data.sh create mode 100644 script/process_qa_data.sh create mode 100644 script/profile_batched_gemm.sh create mode 100644 script/profile_conv_bwd_data.sh create mode 100644 script/profile_conv_fwd.sh create mode 100644 script/profile_gemm.sh create mode 100644 script/profile_gemm_bilinear.sh create mode 100644 script/profile_grouped_gemm.sh create mode 100644 script/profile_onnx_gemm.sh create mode 100644 script/profile_reduce_no_index.sh create mode 100644 script/profile_reduce_with_index.sh create mode 100644 script/profile_resnet50.sh create mode 100644 script/profile_splitK_gemm.sh create mode 100644 script/run_full_performance_tests.sh create mode 100644 script/run_performance_tests.sh create mode 100644 script/test_convnd_fwd.sh create mode 100644 script/test_reduce_no_index.sh create mode 100644 script/test_reduce_with_index.sh create mode 100644 test/CMakeLists.txt create mode 100644 test/batched_gemm/CMakeLists.txt create mode 100644 test/batched_gemm/batched_gemm_bf16.cpp create mode 100644 test/batched_gemm/batched_gemm_fp16.cpp create mode 100644 test/batched_gemm/batched_gemm_fp32.cpp create mode 100644 test/batched_gemm/batched_gemm_int8.cpp create mode 100644 test/batched_gemm_gemm/CMakeLists.txt create mode 100644 test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp create mode 100644 test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp create mode 100644 test/batched_gemm_reduce/CMakeLists.txt create mode 100644 test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp create mode 100644 test/batched_gemm_softmax_gemm/CMakeLists.txt create mode 100644 test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp create mode 100644 test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp create mode 100644 test/batched_gemm_softmax_gemm_permute/CMakeLists.txt create mode 100644 test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp create mode 100644 test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp create mode 100644 test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp create mode 100644 test/batchnorm/CMakeLists.txt create mode 100644 test/batchnorm/batchnorm_bwd_rank_4.cpp create mode 100644 test/batchnorm/batchnorm_fwd_rank_4.cpp create mode 100644 test/block_to_ctile_map/CMakeLists.txt create mode 100644 
test/block_to_ctile_map/test_block_to_ctile_map.cpp create mode 100644 test/conv_util/CMakeLists.txt create mode 100644 test/conv_util/conv_util.cpp create mode 100644 test/convnd_bwd_data/CMakeLists.txt create mode 100644 test/convnd_bwd_data/convnd_bwd_data.cpp create mode 100644 test/convnd_fwd/CMakeLists.txt create mode 100644 test/convnd_fwd/convnd_fwd.cpp create mode 100644 test/data_type/CMakeLists.txt create mode 100644 test/data_type/int4.cpp create mode 100644 test/elementwise_normalization/CMakeLists.txt create mode 100644 test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp create mode 100644 test/gemm/CMakeLists.txt create mode 100644 test/gemm/gemm_bf16.cpp create mode 100644 test/gemm/gemm_fp16.cpp create mode 100644 test/gemm/gemm_fp32.cpp create mode 100644 test/gemm/gemm_fp64.cpp create mode 100644 test/gemm/gemm_int8.cpp create mode 100644 test/gemm/gemm_standalone_xdl_fp16.cpp create mode 100644 test/gemm/gemm_util.hpp create mode 100644 test/gemm/instance/gemm_f16_nn_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_nn_instance.hpp create mode 100644 test/gemm/instance/gemm_f16_nt_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_nt_instance.hpp create mode 100644 test/gemm/instance/gemm_f16_tn_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_tn_instance.hpp create mode 100644 test/gemm/instance/gemm_f16_tt_instance.cpp create mode 100644 test/gemm/instance/gemm_f16_tt_instance.hpp create mode 100644 test/gemm/run_gemm_test.inc create mode 100644 test/gemm_reduce/CMakeLists.txt create mode 100644 test/gemm_reduce/gemm_reduce_fp16.cpp create mode 100644 test/gemm_split_k/CMakeLists.txt create mode 100644 test/gemm_split_k/gemm_split_k.cpp create mode 100644 test/grouped_convnd_bwd_weight/CMakeLists.txt create mode 100644 test/grouped_convnd_bwd_weight/grouped_convnd_bwd_weight.cpp create mode 100644 test/grouped_convnd_fwd/CMakeLists.txt create mode 100644 test/grouped_convnd_fwd/grouped_convnd_fwd.cpp create mode 100644 test/grouped_gemm/CMakeLists.txt create mode 100644 test/grouped_gemm/grouped_gemm_fp16.cpp create mode 100644 test/magic_number_division/CMakeLists.txt create mode 100644 test/magic_number_division/magic_number_division.cpp create mode 100644 test/normalization/CMakeLists.txt create mode 100644 test/normalization/test_groupnorm_fp16.cpp create mode 100644 test/normalization/test_groupnorm_fp32.cpp create mode 100644 test/normalization/test_layernorm2d_fp16.cpp create mode 100644 test/normalization/test_layernorm2d_fp32.cpp create mode 100644 test/reduce/CMakeLists.txt create mode 100644 test/reduce/reduce_no_index.cpp create mode 100644 test/reduce/reduce_with_index.cpp create mode 100644 test/reference_conv_fwd/CMakeLists.txt create mode 100644 test/reference_conv_fwd/reference_conv_fwd.cpp create mode 100644 test/softmax/CMakeLists.txt create mode 100644 test/softmax/test_softmax_interface.cpp create mode 100644 test/softmax/test_softmax_rank3.cpp create mode 100644 test/softmax/test_softmax_rank4.cpp create mode 100644 test/softmax/test_softmax_ut_cases.inc create mode 100644 test/softmax/test_softmax_util.hpp create mode 100644 test/space_filling_curve/CMakeLists.txt create mode 100644 test/space_filling_curve/space_filling_curve.cpp create mode 100644 test/wmma_op/CMakeLists.txt create mode 100644 test/wmma_op/wmma_op.cpp create mode 100644 test/wmma_op/wmma_op_util.hpp diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..d35fe9e5 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,67 
@@
+cff-version: 1.2.0
+title: Composable Kernel
+message: If you use this software, please cite using the following metadata.
+type: software
+authors:
+  - given-names: Chao
+    family-names: Liu
+    email: chao.liu2@amd.com
+    affiliation: AMD
+  - given-names: Jing
+    family-names: Zhang
+    email: jing.zhang3@amd.com
+    affiliation: AMD
+  - given-names: Letao
+    family-names: Qin
+    email: letao.qin@amd.com
+    affiliation: AMD
+  - given-names: Qianfeng
+    family-names: Zhang
+    email: qianfeng.zhang@amd.com
+    affiliation: AMD
+  - given-names: Liang
+    family-names: Huang
+    email: carlus.huang@amd.com
+    affiliation: AMD
+  - given-names: Shaojie
+    family-names: Wang
+    email: shaojie.wang@amd.com
+    affiliation: AMD
+  - given-names: Anthony
+    family-names: Chang
+    email: antc@amd.com
+    affiliation: AMD
+  - given-names: Chunyu
+    family-names: Lai
+    email: chunyu.lai@amd.com
+    affiliation: AMD
+  - given-names: Illia
+    family-names: Silin
+    email: illia.silin@amd.com
+    affiliation: AMD
+  - given-names: Adam
+    family-names: Osewski
+    email: adam.osewski@amd.com
+    affiliation: AMD
+  - given-names: Poyen
+    family-names: Chen
+    email: poyen.chen@amd.com
+    affiliation: AMD
+  - given-names: Rosty
+    family-names: Geyyer
+    email: rosty.geyyer@amd.com
+    affiliation: AMD
+  - given-names: Hanwen
+    family-names: Chen
+  - given-names: Tejash
+    family-names: Shah
+  - given-names: Xiaoyan
+    family-names: Zhou
+  - given-names: Jianfeng
+    family-names: Yan
+repository-code: 'https://github.com/ROCmSoftwarePlatform/composable_kernel'
+abstract: The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs and CPUs, through general-purpose kernel programming languages such as HIP C++.
+keywords:
+  - 'CK, Composable Kernel, Tensor Coordinate Transformation'
+license: MIT
+license-url: https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/7fc3ed761aa35709d87c8fbbe41dd368648b3541/LICENSE
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..f861e302
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,311 @@
+cmake_minimum_required(VERSION 3.14)
+
+# Check support for CUDA/HIP in CMake
+project(composable_kernel)
+
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+
+enable_testing()
+
+set(ROCM_SYMLINK_LIBS OFF)
+find_package(ROCM REQUIRED PATHS /opt/rocm)
+
+include(ROCMInstallTargets)
+include(ROCMPackageConfigHelpers)
+include(ROCMSetupVersion)
+include(ROCMInstallSymlinks)
+include(ROCMCreatePackage)
+include(CheckCXXCompilerFlag)
+
+rocm_setup_version(VERSION 0.2.0)
+include(TargetFlags)
+list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
+
+option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
OFF) + +if(USE_BITINT_EXTENSION_INT4) + add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + add_compile_options(-Wno-bit-int-extension) + message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}") +endif() + +## Threads +set(THREADS_PREFER_PTHREAD_FLAG ON) +find_package(Threads REQUIRED) +link_libraries(Threads::Threads) + +## C++ +enable_language(CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") + +## OpenMP +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + # workaround issue hipcc in rocm3.5 cannot find openmp + set(OpenMP_CXX "${CMAKE_CXX_COMPILER}") + set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument") + set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5") + set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES}) + set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES}) + set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES}) +else() + find_package(OpenMP REQUIRED) +endif() + +message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}") +message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}") +message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}") +message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}") + +link_libraries(${OpenMP_gomp_LIBRARY}) +link_libraries(${OpenMP_pthread_LIBRARY}) + +## HIP +find_package(HIP REQUIRED) +# Override HIP version in config.h, if necessary. +# The variables set by find_package() can't be overwritten, +# therefore let's use intermediate variables. +set(CK_HIP_VERSION_MAJOR "${HIP_VERSION_MAJOR}") +set(CK_HIP_VERSION_MINOR "${HIP_VERSION_MINOR}") +set(CK_HIP_VERSION_PATCH "${HIP_VERSION_PATCH}") +if( DEFINED CK_OVERRIDE_HIP_VERSION_MAJOR ) + set(CK_HIP_VERSION_MAJOR "${CK_OVERRIDE_HIP_VERSION_MAJOR}") + message(STATUS "CK_HIP_VERSION_MAJOR overriden with ${CK_OVERRIDE_HIP_VERSION_MAJOR}") +endif() +if( DEFINED CK_OVERRIDE_HIP_VERSION_MINOR ) + set(CK_HIP_VERSION_MINOR "${CK_OVERRIDE_HIP_VERSION_MINOR}") + message(STATUS "CK_HIP_VERSION_MINOR overriden with ${CK_OVERRIDE_HIP_VERSION_MINOR}") +endif() +if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) + set(CK_HIP_VERSION_PATCH "${CK_OVERRIDE_HIP_VERSION_PATCH}") + message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") +endif() +message(STATUS "Build with HIP ${HIP_VERSION}") +link_libraries(hip::device) +add_compile_definitions(__HIP_PLATFORM_HCC__=1) + +## tidy +include(EnableCompilerWarnings) +set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name) +if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+") + set(CK_TIDY_CHECKS -modernize-use-override -readability-non-const-parameter) +# Enable tidy on hip +elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU") + set(CK_TIDY_ERRORS ALL) +endif() + + +include(ClangTidy) +enable_clang_tidy( + CHECKS + * + -abseil-* + -android-cloexec-fopen + # Yea we shouldn't be using rand() + -cert-msc30-c + -bugprone-exception-escape + -bugprone-macro-parentheses + -cert-env33-c + -cert-msc32-c + -cert-msc50-cpp + -cert-msc51-cpp + -cert-dcl37-c + -cert-dcl51-cpp + -clang-analyzer-alpha.core.CastToStruct + -clang-analyzer-optin.performance.Padding + -clang-diagnostic-deprecated-declarations + -clang-diagnostic-extern-c-compat + -clang-diagnostic-unused-command-line-argument + -cppcoreguidelines-avoid-c-arrays + -cppcoreguidelines-avoid-magic-numbers + -cppcoreguidelines-explicit-virtual-functions + 
-cppcoreguidelines-init-variables + -cppcoreguidelines-macro-usage + -cppcoreguidelines-non-private-member-variables-in-classes + -cppcoreguidelines-pro-bounds-array-to-pointer-decay + -cppcoreguidelines-pro-bounds-constant-array-index + -cppcoreguidelines-pro-bounds-pointer-arithmetic + -cppcoreguidelines-pro-type-member-init + -cppcoreguidelines-pro-type-reinterpret-cast + -cppcoreguidelines-pro-type-union-access + -cppcoreguidelines-pro-type-vararg + -cppcoreguidelines-special-member-functions + -fuchsia-* + -google-explicit-constructor + -google-readability-braces-around-statements + -google-readability-todo + -google-runtime-int + -google-runtime-references + -hicpp-vararg + -hicpp-braces-around-statements + -hicpp-explicit-conversions + -hicpp-named-parameter + -hicpp-no-array-decay + # We really shouldn't use bitwise operators with signed integers, but + # opencl leaves us no choice + -hicpp-avoid-c-arrays + -hicpp-signed-bitwise + -hicpp-special-member-functions + -hicpp-uppercase-literal-suffix + -hicpp-use-auto + -hicpp-use-equals-default + -hicpp-use-override + -llvm-header-guard + -llvm-include-order + #-llvmlibc-* + -llvmlibc-restrict-system-libc-headers + -llvmlibc-callee-namespace + -llvmlibc-implementation-in-namespace + -llvm-else-after-return + -llvm-qualified-auto + -misc-misplaced-const + -misc-non-private-member-variables-in-classes + -misc-no-recursion + -modernize-avoid-bind + -modernize-avoid-c-arrays + -modernize-pass-by-value + -modernize-use-auto + -modernize-use-default-member-init + -modernize-use-equals-default + -modernize-use-trailing-return-type + -modernize-use-transparent-functors + -performance-unnecessary-value-param + -readability-braces-around-statements + -readability-else-after-return + # we are not ready to use it, but very useful + -readability-function-cognitive-complexity + -readability-isolate-declaration + -readability-magic-numbers + -readability-named-parameter + -readability-uppercase-literal-suffix + -readability-convert-member-functions-to-static + -readability-qualified-auto + -readability-redundant-string-init + # too many narrowing conversions in our code + -bugprone-narrowing-conversions + -cppcoreguidelines-narrowing-conversions + -altera-struct-pack-align + -cppcoreguidelines-prefer-member-initializer + ${CK_TIDY_CHECKS} + ${CK_TIDY_ERRORS} + HEADER_FILTER + "\.hpp$" + EXTRA_ARGS + -DCK_USE_CLANG_TIDY +) + +include(CppCheck) +enable_cppcheck( + CHECKS + warning + style + performance + portability + SUPPRESS + ConfigurationNotChecked + constStatement + duplicateCondition + noExplicitConstructor + passedByValue + preprocessorErrorDirective + shadowVariable + unusedFunction + unusedPrivateFunction + unusedStructMember + unmatchedSuppression + FORCE + SOURCES + library/src + INCLUDE + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_BINARY_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/library/include + DEFINE + CPPCHECK=1 + __linux__=1 +) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin) + +include_directories(BEFORE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include + ${HIP_INCLUDE_DIRS} +) + + +SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV") +if(BUILD_DEV) + add_compile_options(-Werror) + add_compile_options(-Weverything) +endif() +message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + +add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C 
${CMAKE_CFG_INTDIR}) + +file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp") +file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*) +set(CK_DEVICE_INSTANCES) +FOREACH(subdir_path ${dir_list}) + IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}") + list(APPEND CK_DEVICE_INSTANCES device_${subdir_path}_instance) + ENDIF() +ENDFOREACH() +add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) + +rocm_package_setup_component(tests + LIBRARY_NAME composablekernel + PACKAGE_NAME tests # Prevent -static suffix on package name +) + +rocm_package_setup_component(examples + LIBRARY_NAME composablekernel + PACKAGE_NAME examples +) + +rocm_package_setup_component(profiler + LIBRARY_NAME composablekernel + PACKAGE_NAME ckProfiler +) + +add_subdirectory(library) +add_subdirectory(example) +add_subdirectory(test) +add_subdirectory(profiler) + +#Create an interface target for the include only files and call it "composablekernels" +include(CMakePackageConfigHelpers) + +set(version 1.0.0) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + VERSION "${version}" + COMPATIBILITY AnyNewerVersion +) + +configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) + +rocm_install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) + +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") +set(CPACK_RPM_PACKAGE_LICENSE "MIT") + +rocm_create_package( + NAME composablekernel + DESCRIPTION "High Performance Composable Kernel for AMD GPUs" + MAINTAINER "MIOpen Kernels Dev Team " + LDCONFIG + HEADER_ONLY +) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 00000000..8ccfe99c --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,31 @@ +# Composable Kernel Developers and Contributors + +This is the list of developers and contributors to Composable Kernel library + + +## Developers +[Chao Liu](https://github.com/asroy), [Jing Zhang](https://github.com/zjing14), 2018-2022 + +[Letao Qin](https://github.com/ltqin), [Qianfeng Zhang](https://github.com/qianfengz), [Liang Huang](https://github.com/carlushuang), [Shaojie Wang](https://github.com/shaojiewang), 2019-2022 + +[Anthony Chang](https://github.com/rosenrodt), [Chunyu Lai](https://github.com/rocking5566), [Illia Silin](https://github.com/illsilin), [Adam Osewski](https://github.com/aosewski), [Poyen Chen](https://github.com/poyenc), [Rosty Geyyer](https://github.com/geyyer), 2022 + +Hanwen Chang, 2019-2021, + +Tejash Shah, 2019-2020 + +Xiaoyan Zhou, 2020 + +[Jianfeng Yan](https://github.com/j4yan), 2021-2022 + + +## Product Manager +[Jun Liu](https://github.com/junliume) + + +## Contributors +[Dan Yao](https://github.com/danyao12), [Guangzhao Lu](https://github.com/guangzlu), [Raman Jana](https://github.com/ramjana), [Jehandad Khan](https://github.com/JehandadKhan), [Wen-Heng (Jack) Chung](https://github.com/whchung) + + +## Acknowledgement +CK team works closely with Meta [AITemplate](https://github.com/facebookincubator/AITemplate) team 
([Bing Xu](https://github.com/antinucleon), [Hao Lu](https://github.com/hlu1), [Ying Zhang](https://github.com/ipiszy), etc). Most of the lucrative graph optimization opportunities in ML models were identified by AITemplate team, and we also co-designed many high performance fused kernels for AMD GPUs. Without this collaboration, CK would not reach its current potential. diff --git a/Config.cmake.in b/Config.cmake.in new file mode 100644 index 00000000..02978cd4 --- /dev/null +++ b/Config.cmake.in @@ -0,0 +1,11 @@ +@PACKAGE_INIT@ + +set(_composable_kernel_supported_components device_operations utility) + +foreach(_comp ${composable_kernel_FIND_COMPONENTS}) + if(NOT _comp IN_LIST _composable_kernel_supported_components) + set(composable_kernel_FOUND False) + set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}") + endif() + include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake") +endforeach() diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..d024f966 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,110 @@ +FROM ubuntu:20.04 + +ARG ROCMVERSION=5.3 +ARG compiler_version="release" +ARG compiler_commit="" + +RUN set -xe + +ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/ +# Add rocm repository +RUN apt-get update +RUN apt-get install -y wget gnupg +RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - +RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" +RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add - +RUN sh -c "echo deb http://mirrors.kernel.org/ubuntu focal main universe | tee -a /etc/apt/sources.list" + +# Install dependencies +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \ + apt-utils \ + build-essential \ + ccache \ + cmake-data \ + cmake \ + curl \ + git \ + hip-rocclr \ + jq \ + libelf-dev \ + libncurses5-dev \ + libnuma-dev \ + libpthread-stubs0-dev \ + llvm-amdgpu \ + pkg-config \ + python \ + python3 \ + python-dev \ + python3-dev \ + python3-pip \ + software-properties-common \ + rocm-dev \ + rocm-device-libs \ + rocm-cmake \ + vim \ + zlib1g-dev \ + openssh-server \ + clang-format-10 \ + kmod && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Setup ubsan environment to printstacktrace +RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer +ENV UBSAN_OPTIONS=print_stacktrace=1 + +# Install an init system +RUN wget https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64.deb +RUN dpkg -i dumb-init_*.deb && rm dumb-init_*.deb + +ARG PREFIX=/opt/rocm +# Install packages for processing the performance results +RUN pip3 install --upgrade pip +RUN pip3 install sqlalchemy +RUN pip3 install pymysql +RUN pip3 install pandas +RUN pip3 install setuptools-rust +RUN pip3 install sshtunnel +# Setup ubsan environment to printstacktrace +ENV UBSAN_OPTIONS=print_stacktrace=1 + +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 +RUN groupadd -f render + +# Install the new rocm-cmake version +RUN git clone -b master https://github.com/RadeonOpenCompute/rocm-cmake.git && \ + cd rocm-cmake && mkdir build && cd build && \ + cmake .. && cmake --build . && cmake --build . 
--target install + +WORKDIR / + +ENV compiler_version=$compiler_version +ENV compiler_commit=$compiler_commit +RUN sh -c "echo compiler version = '$compiler_version'" +RUN sh -c "echo compiler commit = '$compiler_commit'" + +RUN --mount=type=ssh if [ "$compiler_version" = "amd-stg-open" ]; then \ + sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/hip/bin/hipcc.pl && \ + sed -i '/$HIP_CLANG_TARGET = chomp($HIP_CLANG_TARGET);/c\ chomp($HIP_CLANG_TARGET);' /opt/rocm/bin/hipcc.pl; \ + fi + +RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" = "" ]; then \ + git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ + cd llvm-project && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ + make -j 8 ; \ + else echo "using the release compiler"; \ + fi + +RUN --mount=type=ssh if [ "$compiler_version" != "release" ] && [ "$compiler_commit" != "" ]; then \ + git clone -b "$compiler_version" https://github.com/RadeonOpenCompute/llvm-project.git && \ + cd llvm-project && git checkout "$compiler_commit" && echo "checking out commit $compiler_commit" && mkdir build && cd build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/rocm/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=1 -DLLVM_TARGETS_TO_BUILD="AMDGPU;X86" -DLLVM_ENABLE_PROJECTS="clang;lld;compiler-rt" ../llvm && \ + make -j 8 ; \ + else echo "using the release compiler"; \ + fi + + +#ENV HIP_CLANG_PATH='/llvm-project/build/bin' +#RUN sh -c "echo HIP_CLANG_PATH = '$HIP_CLANG_PATH'" diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 00000000..7b2e57c1 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,705 @@ +def rocmnode(name) { + return 'rocmtest && miopen && ' + name +} + +def show_node_info() { + sh """ + echo "NODE_NAME = \$NODE_NAME" + lsb_release -sd + uname -r + ls /opt/ -la + """ +} + +def runShell(String command){ + def responseCode = sh returnStatus: true, script: "${command} > tmp.txt" + def output = readFile(file: "tmp.txt") + echo "tmp.txt contents: $output" + return (output != "") +} + +def getDockerImageName(){ + def img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}" + return img +} + +def check_host() { + if ("${env.CK_CCACHE}" != "null"){ + def CCACHE_SERVER="${env.CK_CCACHE.split(':')[0]}" + echo "ccache server: ${CCACHE_SERVER}" + sh '''ping -c 1 -p 6379 "${CCACHE_SERVER}" | echo $? 
> tmp.txt''' + def output = readFile(file: "tmp.txt") + echo "tmp.txt contents: \$output" + return (output != "0") + } + else{ + return 1 + } +} + +def build_compiler(){ + def compiler + if (params.BUILD_COMPILER == "hipcc"){ + compiler = '/opt/rocm/bin/hipcc' + } + else{ + if (params.COMPILER_VERSION == "release"){ + compiler = "/opt/rocm/llvm/bin/clang++" + } + else{ + compiler = "/llvm-project/build/bin/clang++" + } + } + return compiler +} + +def getDockerImage(Map conf=[:]){ + env.DOCKER_BUILDKIT=1 + def prefixpath = conf.get("prefixpath", "/opt/rocm") // prefix:/opt/rocm + def no_cache = conf.get("no_cache", false) + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + echo "ccache server: ${env.CK_CCACHE}" + if(env.CK_CCACHE) + { + if(check_host()) + { + echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}" + } + else + { + echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response" + } + dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' " + env.CCACHE_DIR = """/tmp/ccache_store""" + env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}""" + } + if(no_cache) + { + dockerArgs = dockerArgs + " --no-cache " + } + echo "Docker Args: ${dockerArgs}" + def image = getDockerImageName() + //Check if image exists + def retimage + try + { + echo "Pulling down image: ${image}" + retimage = docker.image("${image}") + retimage.pull() + } + catch(Exception ex) + { + error "Unable to locate image: ${image}" + } + return [retimage, image] +} + +def buildDocker(install_prefix){ + show_node_info() + env.DOCKER_BUILDKIT=1 + checkout scm + def image_name = getDockerImageName() + echo "Building Docker for ${image_name}" + def dockerArgs = "--build-arg BUILDKIT_INLINE_CACHE=1 --build-arg PREFIX=${install_prefix} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + echo "ccache server: ${env.CK_CCACHE}" + if(env.CK_CCACHE) + { + if(check_host()) + { + echo "FOUND CCACHE SERVER: ${env.CK_CCACHE}" + } + else + { + echo "CCACHE SERVER: ${env.CK_CCACHE} NOT FOUND, got ${check_host} response" + } + dockerArgs = dockerArgs + " --build-arg CCACHE_SECONDARY_STORAGE='redis://${env.CK_CCACHE}' --build-arg COMPILER_LAUNCHER='ccache' " + env.CCACHE_DIR = """/tmp/ccache_store""" + env.CCACHE_SECONDARY_STORAGE="""redis://${env.CK_CCACHE}""" + } + + echo "Build Args: ${dockerArgs}" + try{ + if(params.BUILD_DOCKER){ + //force building the new docker if that parameter is true + echo "Building image: ${image_name}" + retimage = docker.build("${image_name}", dockerArgs + ' .') + retimage.push() + } + else{ + echo "Checking for image: ${image_name}" + sh "docker manifest inspect --insecure ${image_name}" + echo "Image: ${image_name} found!! Skipping building image" + } + } + catch(Exception ex){ + echo "Unable to locate image: ${image_name}. 
Building image now" + retimage = docker.build("${image_name}", dockerArgs + ' .') + retimage.push() + } +} + +def cmake_build(Map conf=[:]){ + + def compiler = build_compiler() + def config_targets = conf.get("config_targets","check") + def debug_flags = "-g -fno-omit-frame-pointer -fsanitize=undefined -fno-sanitize-recover=undefined " + conf.get("extradebugflags", "") + def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","") + def prefixpath = conf.get("prefixpath","/opt/rocm") + def setup_args = conf.get("setup_args","") + + if (prefixpath != "/usr/local"){ + setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " + } + + def build_type_debug = (conf.get("build_type",'release') == 'debug') + + //cmake_env can overwrite default CXX variables. + def cmake_envs = "CXX=${compiler} CXXFLAGS='-Werror' " + conf.get("cmake_ex_env","") + + def package_build = (conf.get("package_build","") == "true") + + if (package_build == true) { + config_targets = "package" + } + + if(conf.get("build_install","") == "true") + { + config_targets = 'install ' + config_targets + setup_args = ' -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install' + setup_args + } else{ + setup_args = ' -DBUILD_DEV=On' + setup_args + } + + if(build_type_debug){ + setup_args = " -DCMAKE_BUILD_TYPE=debug -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'" + setup_args + }else{ + setup_args = " -DCMAKE_BUILD_TYPE=release" + setup_args + } + if(env.CK_CCACHE) + { + setup_args = " -DCMAKE_CXX_COMPILER_LAUNCHER='ccache' -DCMAKE_C_COMPILER_LAUNCHER='ccache' " + setup_args + } + echo "ccache server: ${env.CK_CCACHE}" + + def pre_setup_cmd = """ + echo \$HSA_ENABLE_SDMA + ulimit -c unlimited + rm -rf build + mkdir build + rm -rf install + mkdir install + cd build + """ + def setup_cmd = conf.get("setup_cmd", "${cmake_envs} cmake ${setup_args} .. 
") + // reduce parallelism when compiling, clang uses too much memory + def build_cmd = conf.get("build_cmd", "${build_envs} dumb-init make -j\$(( \$(nproc) / 2 )) ${config_targets}") + def execute_cmd = conf.get("execute_cmd", "") + + def cmd = conf.get("cmd", """ + ${pre_setup_cmd} + ${setup_cmd} + ${build_cmd} + ${execute_cmd} + """) + + echo cmd + sh cmd + + // Only archive from master or develop + if (package_build == true && (env.BRANCH_NAME == "develop" || env.BRANCH_NAME == "master")) { + archiveArtifacts artifacts: "build/*.deb", allowEmptyArchive: true, fingerprint: true + } +} + +def buildHipClangJob(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if (params.COMPILER_VERSION != "release"){ + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " + } + + def variant = env.STAGE_NAME + + def retimage + (retimage, image) = getDockerImage(conf) + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 5, unit: 'HOURS') + { + cmake_build(conf) + } + } + } + return retimage +} + +def reboot(){ + build job: 'reboot-slaves', propagate: false , parameters: [string(name: 'server', value: "${env.NODE_NAME}"),] +} + +def buildHipClangJobAndReboot(Map conf=[:]){ + try{ + buildHipClangJob(conf) + } + catch(e){ + echo "throwing error exception for the stage" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def runCKProfiler(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if (params.COMPILER_VERSION != "release"){ + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + (retimage, image) = getDockerImage(conf) + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee 
clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + " --no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 24, unit: 'HOURS') + { + //cmake_build(conf) + //instead of building, just unstash the ckProfiler and install it + sh """ + rm -rf build + mkdir build + """ + dir("build"){ + unstash 'ckProfiler.tar.gz' + sh 'tar -xvf ckProfiler.tar.gz' + } + + dir("script"){ + if (params.RUN_FULL_QA){ + sh "./run_full_performance_tests.sh 1 QA_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + archiveArtifacts "perf_batched_gemm.log" + archiveArtifacts "perf_grouped_gemm.log" + archiveArtifacts "perf_conv_fwd.log" + archiveArtifacts "perf_conv_bwd_data.log" + archiveArtifacts "perf_gemm_bilinear.log" + archiveArtifacts "perf_reduction.log" + archiveArtifacts "perf_splitK_gemm_verify.log" + archiveArtifacts "perf_splitK_gemm.log" + archiveArtifacts "perf_onnx_gemm.log" + // stash perf files to master + stash name: "perf_gemm.log" + stash name: "perf_resnet50_N256.log" + stash name: "perf_resnet50_N4.log" + stash name: "perf_batched_gemm.log" + stash name: "perf_grouped_gemm.log" + stash name: "perf_conv_fwd.log" + stash name: "perf_conv_bwd_data.log" + stash name: "perf_gemm_bilinear.log" + stash name: "perf_reduction.log" + stash name: "perf_splitK_gemm.log" + stash name: "perf_onnx_gemm.log" + //we will process results on the master node + } + else{ + sh "./run_performance_tests.sh 0 CI_${params.COMPILER_VERSION} ${env.BRANCH_NAME} ${NODE_NAME}" + archiveArtifacts "perf_gemm.log" + archiveArtifacts "perf_resnet50_N256.log" + archiveArtifacts "perf_resnet50_N4.log" + // stash perf files to master + stash name: "perf_gemm.log" + stash name: "perf_resnet50_N256.log" + stash name: "perf_resnet50_N4.log" + //we will process the results on the master node + } + } + } + } + } + return retimage +} + +def runPerfTest(Map conf=[:]){ + try{ + runCKProfiler(conf) + } + catch(e){ + echo "throwing error exception in performance tests" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def Build_CK(Map conf=[:]){ + show_node_info() + + env.HSA_ENABLE_SDMA=0 + checkout scm + + def image = getDockerImageName() + def prefixpath = conf.get("prefixpath", "/opt/rocm") + + // Jenkins is complaining about the render group + def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + def dockerArgs = "--build-arg PREFIX=${prefixpath} --build-arg compiler_version='${params.COMPILER_VERSION}' --build-arg 
compiler_commit='${params.COMPILER_COMMIT}' --build-arg ROCMVERSION='${params.ROCMVERSION}' " + if (params.COMPILER_VERSION != "release"){ + dockerOpts = dockerOpts + " --env HIP_CLANG_PATH='/llvm-project/build/bin' " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + (retimage, image) = getDockerImage(conf) + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo | tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + catch(Exception ex) { + retimage = docker.build("${image}", dockerArgs + " --no-cache .") + withDockerContainer(image: image, args: dockerOpts) { + timeout(time: 5, unit: 'MINUTES'){ + sh 'PATH="/opt/rocm/opencl/bin:/opt/rocm/opencl/bin/x86_64:$PATH" clinfo |tee clinfo.log' + if ( runShell('grep -n "Number of devices:.*. 0" clinfo.log') ){ + throw new Exception ("GPU not found") + } + else{ + echo "GPU is OK" + } + } + } + } + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 24, unit: 'HOURS') + { + cmake_build(conf) + dir("build"){ + //run tests and examples + sh 'make -j check' + //we only need the ckProfiler to run the performance tests, so we pack and stash it + sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler' + stash "ckProfiler.tar.gz" + } + } + } + } + return retimage +} + +def Build_CK_and_Reboot(Map conf=[:]){ + try{ + Build_CK(conf) + } + catch(e){ + echo "throwing error exception while building CK" + echo 'Exception occurred: ' + e.toString() + throw e + } + finally{ + if (!conf.get("no_reboot", false)) { + reboot() + } + } +} + +def process_results(Map conf=[:]){ + env.HSA_ENABLE_SDMA=0 + checkout scm + def image = getDockerImageName() + def prefixpath = "/opt/rocm" + + // Jenkins is complaining about the render group + def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" + if (conf.get("enforce_xnack_on", false)) { + dockerOpts = dockerOpts + " --env HSA_XNACK=1 " + } + + def variant = env.STAGE_NAME + def retimage + + gitStatusWrapper(credentialsId: "${status_wrapper_creds}", gitHubContext: "Jenkins - ${variant}", account: 'ROCmSoftwarePlatform', repo: 'composable_kernel') { + try { + (retimage, image) = getDockerImage(conf) + } + catch (org.jenkinsci.plugins.workflow.steps.FlowInterruptedException e){ + echo "The job was cancelled or aborted" + throw e + } + } + + withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { + timeout(time: 1, unit: 'HOURS'){ + try{ + dir("script"){ + if (params.RUN_FULL_QA){ + // unstash perf files to master + unstash "perf_gemm.log" + unstash "perf_resnet50_N256.log" + unstash "perf_resnet50_N4.log" + unstash "perf_batched_gemm.log" + unstash "perf_grouped_gemm.log" + unstash "perf_conv_fwd.log" + unstash "perf_conv_bwd_data.log" + unstash "perf_gemm_bilinear.log" + unstash "perf_reduction.log" + unstash "perf_splitK_gemm.log" + unstash "perf_onnx_gemm.log" + sh "./process_qa_data.sh" + } + else{ + // unstash perf files to master + unstash "perf_gemm.log" + unstash "perf_resnet50_N256.log" + unstash 
"perf_resnet50_N4.log" + sh "./process_perf_data.sh" + } + } + } + catch(e){ + echo "throwing error exception while processing performance test results" + echo 'Exception occurred: ' + e.toString() + throw e + } + } + } +} + +//launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;COMPILER_VERSION=release + 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open''' : "" + +pipeline { + agent none + triggers { + parameterizedCron(CRON_SETTINGS) + } + options { + parallelsAlwaysFailFast() + } + parameters { + booleanParam( + name: "BUILD_DOCKER", + defaultValue: false, + description: "Force building docker image (default: false), set to true if docker image needs to be updated.") + string( + name: 'ROCMVERSION', + defaultValue: '5.3', + description: 'Specify which ROCM version to use: 5.2.3, or 5.3 (default), etc.') + string( + name: 'COMPILER_VERSION', + defaultValue: 'release', + description: 'Specify which version of compiler to use: ck-9110, release (default), or amd-stg-open.') + string( + name: 'COMPILER_COMMIT', + defaultValue: '', + description: 'Specify which commit of compiler branch to use: leave empty to use the latest commit (default), or use 8a82e4eb7ba28521ba9a9424a0315a8a16590424 commit of amd-stg-open branch.') + string( + name: 'BUILD_COMPILER', + defaultValue: 'hipcc', + description: 'Specify whether to build CK with hipcc (default) or with clang.') + booleanParam( + name: "RUN_FULL_QA", + defaultValue: false, + description: "Select whether to run small set of performance tests (default) or full QA") + } + environment{ + dbuser = "${dbuser}" + dbpassword = "${dbpassword}" + dbsship = "${dbsship}" + dbsshport = "${dbsshport}" + dbsshuser = "${dbsshuser}" + dbsshpassword = "${dbsshpassword}" + status_wrapper_creds = "${status_wrapper_creds}" + gerrit_cred="${gerrit_cred}" + DOCKER_BUILDKIT = "1" + } + stages{ + stage("Build Docker"){ + //when { + // beforeAgent true + // expression { params.BUILD_DOCKER.toBoolean() } + //} + parallel{ + stage('Docker /opt/rocm'){ + agent{ label rocmnode("nogpu") } + steps{ + buildDocker('/opt/rocm') + } + } + } + } + stage("Static checks") { + parallel{ + stage('Clang Format') { + agent{ label rocmnode("nogpu") } + environment{ + execute_cmd = "find .. -not -path \'*.git*\' -iname \'*.h\' \ + -o -not -path \'*.git*\' -iname \'*.hpp\' \ + -o -not -path \'*.git*\' -iname \'*.cpp\' \ + -o -iname \'*.h.in\' \ + -o -iname \'*.hpp.in\' \ + -o -iname \'*.cpp.in\' \ + -o -iname \'*.cl\' \ + | grep -v 'build/' \ + | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-10 -style=file {} | diff - {}\'" + } + steps{ + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", execute_cmd: execute_cmd, no_reboot:true) + } + } + } + } + + stage("Build CK and run Tests") + { + parallel + { + stage("Build CK and run Tests") + { + agent{ label rocmnode("gfx908 || gfx90a") } + environment{ + setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" """ : """ -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 " """ }" + execute_args = "${params.COMPILER_VERSION == "ck-9110" ? 
""" cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS="-O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ : """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx908,gfx90a" -DCMAKE_CXX_FLAGS="-O3" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ }" + } + steps{ + Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') + } + } + } + } + + stage("Performance Tests") + { + parallel + { + stage("Run ckProfiler: gfx908 or gfx90a") + { + when { + beforeAgent true + expression { !params.RUN_FULL_QA.toBoolean() } + } + options { retry(2) } + agent{ label rocmnode("gfx908 || gfx90a")} + environment{ + setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx908;gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}" + } + steps{ + runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') + } + } + stage("Run ckProfiler: gfx90a") + { + when { + beforeAgent true + expression { params.RUN_FULL_QA.toBoolean() } + } + options { retry(2) } + agent{ label rocmnode("gfx90a")} + environment{ + setup_args = "${params.COMPILER_VERSION == "ck-9110" ? """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 -Xclang -mlink-builtin-bitcode -Xclang /opt/rocm/amdgcn/bitcode/oclc_abi_version_400.bc" -DBUILD_DEV=On """ : """ -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " -DBUILD_DEV=On """}" + } + steps{ + runPerfTest(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release') + } + } + } + } + stage("Process Performance Test Results") + { + parallel + { + stage("Process results"){ + agent { label 'mici' } + steps{ + process_results() + } + } + } + } + } +} diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..2fe9a845 --- /dev/null +++ b/LICENSE @@ -0,0 +1,28 @@ +Copyright (c) 2018- , Advanced Micro Devices, Inc. (Chao Liu, Jing Zhang) +Copyright (c) 2019- , Advanced Micro Devices, Inc. (Letao Qin, Qianfeng Zhang, Liang Huang, Shaojie Wang) +Copyright (c) 2022- , Advanced Micro Devices, Inc. (Anthony Chang, Chunyu Lai, Illia Silin, Adam Osewski, Poyen Chen, Jehandad Khan) +Copyright (c) 2019-2021, Advanced Micro Devices, Inc. (Hanwen Chang) +Copyright (c) 2019-2020, Advanced Micro Devices, Inc. (Tejash Shah) +Copyright (c) 2020 , Advanced Micro Devices, Inc. (Xiaoyan Zhou) +Copyright (c) 2021-2022, Advanced Micro Devices, Inc. (Jianfeng Yan) + +SPDX-License-Identifier: MIT +Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 5897a9cb..151da974 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,96 @@ -# composable_kernel_onnxruntime +# Composable Kernel +## Methodology +The Composable Kernel (CK) library aims to provide a programming model for writing performance-critical kernels for machine learning workloads across multiple architectures, including GPUs and CPUs, through general-purpose kernel languages such as HIP C++. + +CK utilizes two concepts to achieve performance portability and code maintainability: +* A tile-based programming model +* Algorithm complexity reduction for complex ML operators, using an innovative technique we call "Tensor Coordinate Transformation". + +![ALT](/doc/image/ck_component.png "CK Components") + +## Code Structure +The current CK library is structured into 4 layers: +* "Templated Tile Operators" layer +* "Templated Kernel and Invoker" layer +* "Instantiated Kernel and Invoker" layer +* "Client API" layer + +![ALT](/doc/image/ck_layer.png "CK Layers") + +## Contributors +The list of developers and contributors is here: [Contributors](/CONTRIBUTORS.md) + +## Citation +If you use CK, please use the following citations: +* The CK paper will be freely available on arXiv soon: [Realizing Tensor Operators Using Coordinate Transformations and Tile Based Programming](???) +* [CITATION.cff](/CITATION.cff) + +## License +CK is released under the MIT license. [License File](/LICENSE) + + +# Build CK + +## Build docker image +```bash +DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile . +``` + +## Launch docker +```bash +docker run \ +-it \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ +ck:latest \ +/bin/bash +``` + +## Build CK +```bash +mkdir build && cd build + +# Need to specify the target ID; the example below is for gfx908 and gfx90a +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3" \ +-D CMAKE_BUILD_TYPE=Release \ +-D GPU_TARGETS="gfx908;gfx90a" \ +..
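+
+# Not sure which target ID to use? The following prints the gfx names of the GPUs
+# visible on this machine (a quick check, assuming the ROCm rocminfo tool is installed):
+#   rocminfo | grep gfx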
+``` + +### Build examples and tests +```bash + make -j examples tests + make test +``` + +Instructions for running each individual example are under [example](/example) + + +## Build ckProfiler +```bash + make -j ckProfiler +``` +Instructions for running ckProfiler are under [profiler](/profiler) + +## Install CK +```bash +make install +``` + +## Using CK as pre-built kernel library +Instructions for using CK as a pre-built kernel library are under [client_example](/client_example) + +## Caveat +### Kernel Timing and Verification +CK's own kernel timer will warm up the kernel once, and then run it multiple times +to get the average kernel time. For kernels that use atomic add, this causes the +output buffer to be accumulated multiple times, leading to verification failures. +To work around this, do not use CK's own timer and verification at the same time. +CK's own timer and verification in each example and ckProfiler can be enabled or +disabled from the command line. diff --git a/client_example/01_gemm/CMakeLists.txt b/client_example/01_gemm/CMakeLists.txt new file mode 100644 index 00000000..9e741192 --- /dev/null +++ b/client_example/01_gemm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_gemm gemm.cpp) +target_link_libraries(client_gemm PRIVATE composable_kernel::device_operations) diff --git a/client_example/01_gemm/gemm.cpp b/client_example/01_gemm/gemm.cpp new file mode 100644 index 00000000..a8a6bf16 --- /dev/null +++ b/client_example/01_gemm/gemm.cpp @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 7) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideC = std::stoi(argv[6]); + } + else + { + printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem
a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{})); + + using DeviceOp = + ck::tensor_operation::device::DeviceGemm; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt new file mode 100644 index 00000000..b7b724cc --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/CMakeLists.txt @@ -0,0 +1,13 @@ +add_custom_target(client_gemm_fastgelu_examples) + +add_executable(client_gemm_add_add_fastgelu gemm_add_add_fastgelu.cpp) +target_link_libraries(client_gemm_add_add_fastgelu PRIVATE composable_kernel::device_operations) + 
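+# Note: each client example links only the installed composable_kernel::device_operations
+# target (see "Using CK as pre-built kernel library" in the top-level README); the kernel
+# instances come from the pre-built library rather than being compiled in this project.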
+add_executable(client_gemm_add_fastgelu gemm_add_fastgelu.cpp) +target_link_libraries(client_gemm_add_fastgelu PRIVATE composable_kernel::device_operations) + +add_executable(client_gemm_fastgelu gemm_fastgelu.cpp) +target_link_libraries(client_gemm_fastgelu PRIVATE composable_kernel::device_operations) + +add_dependencies(client_gemm_fastgelu_examples client_gemm_add_add_fastgelu client_gemm_add_fastgelu + client_gemm_fastgelu) diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp new file mode 100644 index 00000000..f88e72b6 --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +using ADataType = F16; +using BDataType = F16; +using D0DataType = F16; +using D1DataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using ELayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = 4096; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 9) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideD0 = std::stoi(argv[6]); + StrideD1 = std::stoi(argv[7]); + StrideE = std::stoi(argv[8]); + } + else + { + printf("arg1 to 8: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, D0Layout{})); + SimpleDeviceMem d1_m_n_device_buf(sizeof(D1DataType) * + f_matrix_space_size(M, N, StrideD1, D1Layout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) 
* f_matrix_space_size(M, N, StrideE, ELayout{})); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp new file mode 100644 index 00000000..512555f9 --- /dev/null +++ 
b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddFastGelu; + +using ADataType = F16; +using BDataType = F16; +using D0DataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using ELayout = Row; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD0 = 0; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 8) + { + M = std::stoi(argv[1]); + N = std::stoi(argv[2]); + K = std::stoi(argv[3]); + + StrideA = std::stoi(argv[4]); + StrideB = std::stoi(argv[5]); + StrideD0 = std::stoi(argv[6]); + StrideE = std::stoi(argv[8]); + } + else + { + printf("arg1 to 7: M, N, K, StrideA, StrideB, StrideD0, StrideE\n"); + exit(0); + } + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem d0_m_n_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, D0Layout{})); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{})); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + 
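+    // The loop below runs every instance returned by the factory with timing enabled
+    // (StreamConfig{nullptr, true}), derives TFLOPS and GB/s from the measured average time,
+    // and remembers the fastest instance that supports this problem so it can be re-run
+    // without timing afterwards.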
+ // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp new file mode 100644 index 00000000..72372310 --- /dev/null +++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
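+//
+// Client example: E = FastGelu(A * B), with A row-major, B column-major and no
+// extra D tensors, dispatched through the prebuilt DeviceGemmMultipleD
+// instances from the CK instance library.
+//
+// Illustrative invocation (the executable name depends on the CMake target and
+// is shown only as an example; running with no arguments uses the defaults):
+//   ./client_gemm_fastgelu 3840 4096 4096 4096 4096 4096
+//   arguments: M N K StrideA StrideB StrideE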
+
+#include <iomanip>
+#include <iostream>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using FastGelu    = ck::tensor_operation::element_wise::FastGelu;
+
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = FastGelu;
+
+using ADataType = F16;
+using BDataType = F16;
+using EDataType = F16;
+
+using ALayout = Row;
+using BLayout = Col;
+using ELayout = Row;
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+};
+
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideE = 4096;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideE = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideE\n");
+        exit(0);
+    }
+
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if(std::is_same<Layout, Row>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem e_device_buf(sizeof(EDataType) * f_matrix_space_size(M, N, StrideE, ELayout{}));
+
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
+        ALayout,
+        BLayout,
+        ck::Tuple<>,
+        ELayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<>,
+        EDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::FastGelu>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+
+    std::string best_op_name;
+    bool found           = false;
+    int best_op_id       = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
+                                                        b_device_buf.GetDeviceBuffer(),
+                                                        {},
+                                                        e_device_buf.GetDeviceBuffer(),
+                                                        M,
+                                                        N,
+                                                        K,
+                                                        StrideA,
+                                                        StrideB,
+                                                        {},
+                                                        StrideE,
+                                                        a_element_op,
+                                                        b_element_op,
cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/03_gemm_layernorm/CMakeLists.txt b/client_example/03_gemm_layernorm/CMakeLists.txt new file mode 100644 index 00000000..3742e708 --- /dev/null +++ b/client_example/03_gemm_layernorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp) +target_link_libraries(client_gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations) diff --git a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp new file mode 100644 index 00000000..6c259407 --- /dev/null +++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
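+//
+// Layer normalization implemented as two device kernels:
+//   (1) a GEMM whose epilogue also produces per-row reductions:
+//       c = gemm(a, b) + bias + d0, together with mean(c) and mean(c^2), and
+//   (2) a normalization kernel that combines c, mean, mean-square, gamma, beta.
+// Roughly, stage (2) computes per element of each row
+//   y = gamma * (c - mean) / sqrt(mean_square - mean^2 + epsilon) + beta,
+// recovering the variance as E[c^2] - E[c]^2.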
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ADataType = F16; +using BDataType = F16; +using BiasDataType = F32; +using CDataType = F16; +using D0DataType = F16; +using ReduceDataType = F32; +using GammaDataType = F16; +using BetaDataType = F16; +using LayerNormOutDataType = F16; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +template +bool RunDeviceGemmMeanSquareMean(gemm_reduce_op_ptr& p_op, + const void* p_a, + const void* p_b, + const void* p_bias, + const void* p_d0, + void* p_c, + void* p_mean, + void* p_square_mean, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int StrideD0, + bool time_kernel) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + + auto passOp = PassThrough{}; + auto squareOp = UnarySquareElementOp{}; + auto divOp = UnaryDivElementOp{N}; + + auto argument_ptr = + p_op->MakeArgumentPointer(p_a, + p_b, + p_bias, + {p_d0}, + p_c, + {p_mean, p_square_mean}, + M, + N, + K, + StrideA, + StrideB, + StrideC, + {StrideD0}, + {&passOp, &passOp, &passOp}, // functor for a, b, c + {&passOp}, // functor for d0 + {&passOp, &squareOp}, // functor for inputs of reduction + {&divOp, &divOp}); // functor for outputs of reduction + + if(p_op->IsSupportedArgument(argument_ptr.get())) + { + auto invoker_ptr = p_op->MakeInvokerPointer(); + + // If we evaluate running time of gemm_reduce. The output may wrong. + // Because we need to initialize the reduction tensor before runing the kernel. + // However we run kernel many times for time_kernel = trie without reinitialize the out + // of reduction tensor. 
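+        // In other words: with time_kernel == true the invoker launches the kernel
+        // repeatedly for timing, and since the reduction output tensors are not
+        // re-initialized between launches, the reported time is still meaningful
+        // but the final mean / mean-square values may not be.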
+ float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + std::cout << "Gemm + reduce Perf: " << std::setw(10) << ave_time << " ms" << std::endl; + + return true; + } + + return false; +} + +template +bool RunDeviceNormalize2D(normalize_op_ptr& p_op, + const void* p_x, + const void* p_mean, + const void* p_square_mean, + const void* p_gamma, + const void* p_beta, + void* p_y, + int M, + int N, + int StrideX, + bool time_kernel) +{ + std::array input = {p_x, p_mean, p_square_mean, p_gamma, p_beta}; + std::array output = {p_y}; + auto normalize_functor = ck::tensor_operation::element_wise::Normalize{}; + + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideX, 1}; + + auto argument_ptr = p_op->MakeArgumentPointer(xyLengths, + {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, + output, + ck::tensor_operation::element_wise::Normalize{}); + + if(p_op->IsSupportedArgument(argument_ptr.get())) + { + auto invoker_ptr = p_op->MakeInvokerPointer(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + std::cout << "Normalize Perf: " << std::setw(10) << ave_time << " ms" << std::endl; + + return true; + } + + return false; +} + +int main() +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideC = 1024; + ck::index_t StrideD0 = 1024; + + const auto gemm_reduce_ptrs = + ck::tensor_operation::device::instance::get_device_gemm_add_add_mean_squaremean_instances< + ADataType, + BDataType, + CDataType, + ALayout, + BLayout, + CLayout>(); + + const auto normalize_ptrs = + ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances< + CDataType, + ReduceDataType, + ReduceDataType, + GammaDataType, + BetaDataType, + LayerNormOutDataType>(); + + std::cout << "found " << gemm_reduce_ptrs.size() + << " gemm_reduceMean_reduceSquareMean instances" << std::endl; + + std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl; + + auto f_matrix_space_size = + [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) { + using Layout = decltype(layout); + + if(std::is_same::value) + { + return (nRow - 1) * stride + nCol; + } + else + { + return (nCol - 1) * stride + nRow; + } + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{})); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{})); + SimpleDeviceMem bias_device_buf(sizeof(BiasDataType) * N); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{})); + SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * + f_matrix_space_size(M, N, StrideD0, CLayout{})); + SimpleDeviceMem reduceMean_device_buf(sizeof(ReduceDataType) * M); + SimpleDeviceMem reduceMeanSquare_device_buf(sizeof(ReduceDataType) * M); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * M * N); + + bool b_time_kernel = true; + bool b_only_run_first_kernel = true; + + // layernorm => (1) + (2) + // (1). c = gemm(a, b), reduce_mean(c), reduce_square_mean(c) + // (2). 
normalize(c, mean, square_mean, gamma, beta) + for(auto& gemm_reduce_ptr : gemm_reduce_ptrs) + { + // run first available kernel + if(RunDeviceGemmMeanSquareMean(gemm_reduce_ptr, + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + bias_device_buf.GetDeviceBuffer(), + d0_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + reduceMean_device_buf.GetDeviceBuffer(), + reduceMeanSquare_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + StrideC, + StrideD0, + b_time_kernel)) + { + if(b_only_run_first_kernel) + break; + } + else + { + std::cout << gemm_reduce_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } + } + + for(auto& normalize_ptr : normalize_ptrs) + { + if(RunDeviceNormalize2D(normalize_ptr, + c_device_buf.GetDeviceBuffer(), + reduceMean_device_buf.GetDeviceBuffer(), + reduceMeanSquare_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + layerNorm_device_buf.GetDeviceBuffer(), + M, + N, + StrideC, + b_time_kernel)) + { + if(b_only_run_first_kernel) + break; + } + else + { + std::cout << normalize_ptr->GetTypeString() << " does not support this problem" + << std::endl; + } + } +} diff --git a/client_example/04_contraction/CMakeLists.txt b/client_example/04_contraction/CMakeLists.txt new file mode 100644 index 00000000..4bc6780f --- /dev/null +++ b/client_example/04_contraction/CMakeLists.txt @@ -0,0 +1,6 @@ +add_executable(client_contraction_scale contraction_scale.cpp) +target_link_libraries(client_contraction_scale PRIVATE composable_kernel::device_operations) + +add_executable(client_contraction_bilinear contraction_bilinear.cpp) +target_link_libraries(client_contraction_bilinear PRIVATE composable_kernel::device_operations) + diff --git a/client_example/04_contraction/contraction_bilinear.cpp b/client_example/04_contraction/contraction_bilinear.cpp new file mode 100644 index 00000000..91dead41 --- /dev/null +++ b/client_example/04_contraction/contraction_bilinear.cpp @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
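+//
+// Tensor contraction with a bilinear epilogue. With NumDimM = NumDimN = NumDimK = 2
+// the instances below compute, in index form,
+//   E[m0,m1,n0,n1] = alpha * sum_{k0,k1} A[m0,m1,k0,k1] * B[n0,n1,k0,k1]
+//                    + beta * D[m0,m1,n0,n1],
+// i.e. a GEMM in which every GEMM dimension is split across two tensor dimensions.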
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp" +#include "ck/library/utility/numeric.hpp" + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Bilinear; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DDataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F32; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; + + float alpha = 1.f; + float beta = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 25) + { + const ck::index_t M0 = std::stoi(argv[1]); + const ck::index_t M1 = std::stoi(argv[2]); + + const ck::index_t N0 = std::stoi(argv[3]); + const ck::index_t N1 = std::stoi(argv[4]); + + const ck::index_t K0 = std::stoi(argv[5]); + const ck::index_t K1 = std::stoi(argv[6]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])}; + + d_ms_ns_lengths = {M0, M1, N0, N1}; + d_ms_ns_strides = { + std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21]), std::stoi(argv[22])}; + + alpha = std::stof(argv[23]); + beta = std::stof(argv[24]); + } + else + { + printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n"); + printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg15 to 18: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n"); + printf("arg19 to 22: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg23 to 24: alpha, beta\n"); + exit(0); + } + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) 
* + f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides)); + SimpleDeviceMem d_device_buf(sizeof(DDataType) * + f_tensor_space_size(d_ms_ns_lengths, d_ms_ns_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides)); + + using DeviceOp = ck::tensor_operation::device::DeviceContractionMultipleD< + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{alpha, beta}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = + op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/04_contraction/contraction_scale.cpp b/client_example/04_contraction/contraction_scale.cpp new file mode 100644 index 00000000..4e08ee19 --- /dev/null +++ b/client_example/04_contraction/contraction_scale.cpp @@ -0,0 
+1,222 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/contraction_scale.hpp" +#include "ck/library/utility/numeric.hpp" + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = Scale; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; + + float scale = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 20) + { + const ck::index_t M0 = std::stoi(argv[1]); + const ck::index_t M1 = std::stoi(argv[2]); + + const ck::index_t N0 = std::stoi(argv[3]); + const ck::index_t N1 = std::stoi(argv[4]); + + const ck::index_t K0 = std::stoi(argv[5]); + const ck::index_t K1 = std::stoi(argv[6]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[7]), std::stoi(argv[8]), std::stoi(argv[9]), std::stoi(argv[10])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13]), std::stoi(argv[14])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17]), std::stoi(argv[18])}; + + scale = std::stof(argv[19]); + } + else + { + printf("arg1 to 6: M0, M1, N0, N1, K0, K1\n"); + printf("arg7 to 10: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg11 to 14: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg15 to 18: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg19: scale\n"); + exit(0); + } + + auto f_tensor_space_size = [](auto lengths, auto strides) { + std::size_t space_size = 1; + for(std::size_t i = 0; i < lengths.size(); ++i) + { + space_size += (lengths[i] - 1) * strides[i]; + } + return space_size; + }; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * + f_tensor_space_size(a_ms_ks_lengths, a_ms_ks_strides)); + SimpleDeviceMem b_device_buf(sizeof(BDataType) * + f_tensor_space_size(b_ns_ks_lengths, b_ns_ks_strides)); + SimpleDeviceMem e_device_buf(sizeof(EDataType) * + f_tensor_space_size(e_ms_ns_lengths, e_ms_ns_strides)); + + using DeviceOp = 
ck::tensor_operation::device::DeviceContractionMultipleD< + NumDimM, + NumDimN, + NumDimK, + ADataType, + BDataType, + ck::Tuple<>, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{scale}; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + ck::index_t M = ck::accumulate_n( + e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return 0; +} diff --git a/client_example/05_layernorm/CMakeLists.txt b/client_example/05_layernorm/CMakeLists.txt new file mode 100644 index 00000000..b582b485 --- /dev/null +++ b/client_example/05_layernorm/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_layernorm2d layernorm2d.cpp) +target_link_libraries(client_layernorm2d PRIVATE composable_kernel::device_operations) diff --git a/client_example/05_layernorm/layernorm2d.cpp b/client_example/05_layernorm/layernorm2d.cpp new file mode 100644 index 00000000..adb41171 --- /dev/null +++ b/client_example/05_layernorm/layernorm2d.cpp @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
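+//
+// Single-kernel 2D layer normalization: each of the M rows of the [M, N] input
+// is normalized over its N elements (reduce dimension {1}) and then scaled by
+// gamma[N] and shifted by beta[N], both of which are shared across rows.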
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t Stride = 1024; + + auto xy_size = (M - 1) * Stride + N; + + SimpleDeviceMem x_device_buf(sizeof(XDataType) * xy_size); + SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem y_device_buf(sizeof(YDataType) * xy_size); + + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + {Stride, 1}, // xStrides + {0, 1}, // gammaStrides + {0, 1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + + sizeof(BetaDataType) * N + sizeof(YDataType) * M * N; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + {Stride, 1}, // xStrides + 
{1}, // gammaStrides + {1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + x_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer(), + y_device_buf.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/06_softmax/CMakeLists.txt b/client_example/06_softmax/CMakeLists.txt new file mode 100644 index 00000000..b38a0fd9 --- /dev/null +++ b/client_example/06_softmax/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_softmax4d softmax4d.cpp) +target_link_libraries(client_softmax4d PRIVATE composable_kernel::device_operations) diff --git a/client_example/06_softmax/softmax4d.cpp b/client_example/06_softmax/softmax4d.cpp new file mode 100644 index 00000000..7745ddf3 --- /dev/null +++ b/client_example/06_softmax/softmax4d.cpp @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumReduceDim = 2; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::vector in_lengths{2, 8, 128, 1024}; + std::vector in_strides{8 * 128 * 1024, 128 * 1024, 1024, 1}; + std::vector reduce_dims{2, 3}; + + ck::index_t num_elements = + std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies()); + + AccDataType alpha{2.0f}; + AccDataType beta{2.0f}; + + SimpleDeviceMem in(sizeof(InDataType) * num_elements); + SimpleDeviceMem out(sizeof(OutDataType) * num_elements); + + using DeviceOp = ck::tensor_operation::device:: + DeviceSoftmax; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim) + { + continue; + } + + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + &alpha, + &beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); 
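+
+        // Note on the bandwidth estimate below: the existing contents of the output
+        // tensor only count as extra traffic when beta != 0, since only then does
+        // the kernel have to read the prior output before blending it with the
+        // softmax result (hence the "(beta == 0.0f ? 1 : 2)" factor).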
+ + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = num_elements * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * num_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + reduce_dims, + &alpha, + &beta, + in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/07_grouped_conv2d_fwd/CMakeLists.txt b/client_example/07_grouped_conv2d_fwd/CMakeLists.txt new file mode 100644 index 00000000..ddc83168 --- /dev/null +++ b/client_example/07_grouped_conv2d_fwd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp) +target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations) diff --git a/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp b/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp new file mode 100644 index 00000000..ece6e30c --- /dev/null +++ b/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
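+//
+// Grouped 2D forward convolution (G independent convolutions per launch). The
+// host code first builds packed strides for the GNHWC / GKYXC / GNHWK layouts
+// via std::partial_sum over the reversed lengths, then rotates the last three
+// dimensions so the device op receives lengths and strides in a channels-first
+// (NCHW-style) order while the underlying memory stays channels-last.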
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + std::array in_lengths{G, N, Hi, Wi, C}; + std::array in_strides{0, 0, 0, 0, 1}; + + std::array wei_lengths{G, K, Y, X, C}; + std::array wei_strides{0, 0, 0, 0, 1}; + + std::array out_lengths{G, N, Ho, Wo, K}; + std::array out_strides{0, 0, 0, 0, 1}; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW + std::rotate( + rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3)); + std::rotate( + rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3)); + std::rotate( + rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3)); + std::rotate( + rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3)); + std::rotate( + rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3)); + std::rotate( + rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3)); + + std::array filter_strides{1, 1}; + std::array filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float 
best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + + sizeof(WeiDataType) * G * K * Y * X * C + + sizeof(OutDataType) * G * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/08_fused_attention/CMakeLists.txt b/client_example/08_fused_attention/CMakeLists.txt new file mode 100644 index 00000000..5cdea72f --- /dev/null +++ b/client_example/08_fused_attention/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_fused_attention fused_attention.cpp) +target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_operations) diff --git a/client_example/08_fused_attention/fused_attention.cpp b/client_example/08_fused_attention/fused_attention.cpp new file mode 100644 index 00000000..fe927da1 --- /dev/null +++ b/client_example/08_fused_attention/fused_attention.cpp @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
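+//
+// Fused attention using a single batched GEMM + softmax + GEMM (permute) device
+// op. Per (G0, G1) batch it computes, roughly,
+//   C = softmax( (A x B0^T) / sqrt(K) ) x B1
+// with masking disabled (MaskingSpecialization::MaskDisabled); the 1/sqrt(K)
+// factor is supplied through the Scale element-wise op applied to the first
+// GEMM's accumulator.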
+ +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using B0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::PassThrough; + +constexpr static auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +using ADataType = ck::half_t; +using B0DataType = ck::half_t; +using B1DataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + int G0 = 48; + int G1 = 16; + int M = 1024; + int N = 1024; + int K = 64; + int O = 64; + + // A layout [G0, M, G1, K] + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1}; + + // B0 layout [G0, N, G1, K] + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1}; + + // B1 layout [G0, N, G1, O] + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O}; + + // C layout [G0, M, G1, O] + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; + + SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K); + SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K); + SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N); + SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O); + + using DeviceOp = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + MaskingSpec>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + b1_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + {}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // 
acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + AElementOp{}, + B0ElementOp{}, + Acc0ElementOp{1 / sqrtf(K)}, + B1ElementOp{}, + CElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + G0 * G1; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best instance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b0_device_buf.GetDeviceBuffer(), + b1_device_buf.GetDeviceBuffer(), + c_device_buf.GetDeviceBuffer(), + {}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + AElementOp{}, + B0ElementOp{}, + Acc0ElementOp{1 / sqrtf(K)}, + B1ElementOp{}, + CElementOp{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/09_quantization/CMakeLists.txt b/client_example/09_quantization/CMakeLists.txt new file mode 100644 index 00000000..7dc9b860 --- /dev/null +++ b/client_example/09_quantization/CMakeLists.txt @@ -0,0 +1,11 @@ +add_executable(client_conv2d_fwd_bias_relu_perchannel_quantization conv2d_fwd_bias_relu_perchannel_quantization.cpp) +target_link_libraries(client_conv2d_fwd_bias_relu_perchannel_quantization PRIVATE composable_kernel::device_operations) + +add_executable(client_conv2d_fwd_bias_relu_perlayer_quantization conv2d_fwd_bias_relu_perlayer_quantization.cpp) +target_link_libraries(client_conv2d_fwd_bias_relu_perlayer_quantization PRIVATE composable_kernel::device_operations) + +add_executable(client_conv2d_fwd_perchannel_quantization conv2d_fwd_perchannel_quantization.cpp) +target_link_libraries(client_conv2d_fwd_perchannel_quantization PRIVATE composable_kernel::device_operations) + +add_executable(client_conv2d_fwd_perlayer_quantization conv2d_fwd_perlayer_quantization.cpp) +target_link_libraries(client_conv2d_fwd_perlayer_quantization PRIVATE composable_kernel::device_operations) diff --git 
a/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp new file mode 100644 index 00000000..bcb0cefa --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perchannel_quantization.cpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using RequantScaleDataType = float; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::tensor_layout::convolution::G_K; +using RequantScaleLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array bias_lengths{G, N, K, Ho, Wo}; + std::array bias_strides{K, 0, 1, 0, 0}; + std::array requant_scale_lengths{G, N, K, Ho, Wo}; + std::array requant_scale_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD< + NumDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = 
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths, requant_scale_lengths}, + {bias_strides, requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = + op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer(), requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths, requant_scale_lengths}, + {bias_strides, requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp new file mode 100644 index 00000000..26c7aa15 --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_bias_relu_perlayer_quantization.cpp @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
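+// Client example: int8 grouped conv2d forward with an int32 bias, ReLU activation, and
+// per-layer requantization (Add_Activation_Mul_Clamp with a single scalar scale).
+// All factory instances are timed and the fastest supported one is re-run without timing.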
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array bias_lengths{G, N, K, Ho, Wo}; + std::array bias_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem bias(sizeof(BiasDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, 
+ conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {bias.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {bias_lengths}, + {bias_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp new file mode 100644 index 00000000..475b2f03 --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_perchannel_quantization.cpp @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
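+// Client example: int8 grouped conv2d forward with per-channel requantization; a float
+// scale tensor laid out as G_K is broadcast over N/Ho/Wo via Activation_Mul2_Clamp with a
+// PassThrough activation. Instances are profiled and the best one is re-run without timing.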
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using RequantScaleDataType = float; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using RequantScaleLayout = ck::tensor_layout::convolution::G_K; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array requant_scale_lengths{G, N, K, Ho, Wo}; + std::array requant_scale_strides{K, 0, 1, 0, 0}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem requant_scale(sizeof(RequantScaleDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {requant_scale.GetDeviceBuffer()}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {requant_scale_lengths}, + 
{requant_scale_strides}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp new file mode 100644 index 00000000..da7b7e6a --- /dev/null +++ b/client_example/09_quantization/conv2d_fwd_perlayer_quantization.cpp @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
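+// Client example: int8 grouped conv2d forward with per-layer requantization and no bias;
+// a single scalar scale (0.5f) is applied through Activation_Mul_Clamp with a PassThrough
+// activation. Instances are profiled and the best one is re-run without timing.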
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using OutDataType = int8_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 4; +static constexpr ck::index_t K = 64; +static constexpr ck::index_t C = 32; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 71; +static constexpr ck::index_t Wi = 71; +static constexpr ck::index_t Ho = 36; +static constexpr ck::index_t Wo = 36; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{G, N, C, Hi, Wi}; + std::array in_strides{N * Hi * Wi * C, Hi * Wi * C, 1, Wi * C, C}; + std::array weight_lengths{G, K, C, Y, X}; + std::array weight_strides{K * Y * X * C, Y * X * C, 1, X * C, C}; + std::array out_lengths{G, N, C, Ho, Wo}; + std::array out_strides{N * Ho * Wo * C, Ho * Wo * C, 1, Wo * C, C}; + std::array in_left_pad{1, 1}; + std::array in_right_pad{1, 1}; + std::array conv_strides{2, 2}; + std::array conv_dilations{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + OutElementOp>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = 
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = G * 2 * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = G * sizeof(InDataType) * N * Hi * Wi * C + + G * sizeof(WeiDataType) * K * Y * X * C + + G * sizeof(OutDataType) * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + weight_lengths, + weight_strides, + {}, + {}, + out_lengths, + out_strides, + conv_strides, + conv_dilations, + in_left_pad, + in_right_pad, + PassThrough{}, + PassThrough{}, + OutElementOp{0.5f, ActivationOp{}}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt b/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt new file mode 100644 index 00000000..e564f318 --- /dev/null +++ b/client_example/10_grouped_conv2d_bwd_data/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_grouped_conv2d_bwd_data grouped_conv2d_bwd_data.cpp) +target_link_libraries(client_grouped_conv2d_bwd_data PRIVATE composable_kernel::device_operations) diff --git a/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp new file mode 100644 index 00000000..55c78980 --- /dev/null +++ b/client_example/10_grouped_conv2d_bwd_data/grouped_conv2d_bwd_data.cpp @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
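+// Client example: fp16 grouped conv2d backward-data (DeviceGroupedConvBwdDataMultipleD).
+// Packed strides are built with std::partial_sum and the length/stride arrays are rotated
+// from channels-last to channels-first order before profiling all instances; the program
+// exits with EXIT_FAILURE if no instance supports the problem.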
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + std::array in_lengths{G, N, Hi, Wi, C}; + std::array in_strides{0, 0, 0, 0, 1}; + + std::array wei_lengths{G, K, Y, X, C}; + std::array wei_strides{0, 0, 0, 0, 1}; + + std::array out_lengths{G, N, Ho, Wo, K}; + std::array out_strides{0, 0, 0, 0, 1}; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW + std::rotate( + rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3)); + std::rotate( + rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3)); + std::rotate( + rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3)); + std::rotate( + rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3)); + std::rotate( + rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3)); + std::rotate( + rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3)); + + std::array filter_strides{1, 1}; + std::array filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD, + InLayout, + OutDataType, + WeiDataType, + ck::Tuple<>, + InDataType, + PassThrough, + PassThrough, + PassThrough>; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << 
std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + in.GetDeviceBuffer(), + out_lengths, + out_strides, + wei_lengths, + wei_strides, + {}, + {}, + in_lengths, + in_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + + sizeof(WeiDataType) * G * K * Y * X * C + + sizeof(OutDataType) * G * N * Ho * Wo * K; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(out.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + {}, + in.GetDeviceBuffer(), + out_lengths, + out_strides, + wei_lengths, + wei_strides, + {}, + {}, + in_lengths, + in_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt new file mode 100644 index 00000000..3e3f6677 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_grouped_conv2d_bwd_weight grouped_conv2d_bwd_weight.cpp) +target_link_libraries(client_grouped_conv2d_bwd_weight PRIVATE composable_kernel::device_operations) diff --git a/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp new file mode 100644 index 00000000..1ecc8568 --- /dev/null +++ b/client_example/11_grouped_conv_bwd_weight/grouped_conv2d_bwd_weight.cpp @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + std::array input_spatial_lengths{Hi, Wi}; + std::array filter_spatial_lengths{Y, X}; + std::array output_spatial_lengths{Ho, Wo}; + + std::array conv_filter_strides{1, 1}; + std::array conv_filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + ck::index_t split_k = 2; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + out.GetDeviceBuffer(), + G, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}, + split_k); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + + sizeof(WeiDataType) * G * K * Y * X * C + + sizeof(OutDataType) * G * N * Ho * Wo * K; + + float tflops = 
static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + out.GetDeviceBuffer(), + G, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}, + split_k); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/12_elementwise_normalization/CMakeLists.txt b/client_example/12_elementwise_normalization/CMakeLists.txt new file mode 100644 index 00000000..1ba0e127 --- /dev/null +++ b/client_example/12_elementwise_normalization/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_elementwise_layernorm2d elementwise_layernorm2d.cpp) +target_link_libraries(client_elementwise_layernorm2d PRIVATE composable_kernel::device_operations) diff --git a/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp new file mode 100644 index 00000000..8cf46eda --- /dev/null +++ b/client_example/12_elementwise_normalization/elementwise_layernorm2d.cpp @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
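+// Client example: fused elementwise Add of two fp16 inputs followed by 2D layer
+// normalization over the last dimension (Rank = 2, NumReduceDim = 1); the fastest
+// supported instance is selected by measured kernel time and re-run without timing.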
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp" + +using ADataType = ck::half_t; // Input 1 +using BDataType = ck::half_t; // Input 2 +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using XElementwiseOperation = ck::tensor_operation::element_wise::Add; +using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + bool time_kernel = true; + + ck::index_t M = 48 * 256; + ck::index_t N = 1024; + ck::index_t Stride = N; + + auto mn_size = (M - 1) * Stride + N; + + SimpleDeviceMem a_dev_buf(sizeof(ADataType) * mn_size); + SimpleDeviceMem b_dev_buf(sizeof(BDataType) * mn_size); + SimpleDeviceMem gamma_dev_buf(sizeof(GammaDataType) * N); + SimpleDeviceMem beta_dev_buf(sizeof(BetaDataType) * N); + SimpleDeviceMem y_dev_buf(sizeof(YDataType) * mn_size); + + std::array ab_input = {a_dev_buf.GetDeviceBuffer(), + b_dev_buf.GetDeviceBuffer()}; + std::vector abStride = {Stride, 1}; + std::array, 2> abStrides = {abStride, abStride}; + + using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization< + ck::Tuple, + GammaDataType, + BetaDataType, + AccDataType, + YDataType, + XElementwiseOperation, + YElementwiseOperation, + Rank, + NumReduceDim>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + abStrides, + {0, 1}, // gammaStrides + {0, 1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + ab_input, + gamma_dev_buf.GetDeviceBuffer(), + beta_dev_buf.GetDeviceBuffer(), + y_dev_buf.GetDeviceBuffer(), + XElementwiseOperation{}, + YElementwiseOperation{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_byte = sizeof(ADataType) * M * N + sizeof(BDataType) * M * N + + sizeof(GammaDataType) * N + sizeof(BetaDataType) * N + + sizeof(YDataType) * M * N; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + 
if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer({M, N}, // lengths + abStrides, + {1}, // gammaStrides + {1}, // betaStrides + {Stride, 1}, // yStrides + {1}, // reduceDims + 1e-4, + ab_input, + gamma_dev_buf.GetDeviceBuffer(), + beta_dev_buf.GetDeviceBuffer(), + y_dev_buf.GetDeviceBuffer(), + XElementwiseOperation{}, + YElementwiseOperation{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/13_batchnorm/CMakeLists.txt b/client_example/13_batchnorm/CMakeLists.txt new file mode 100644 index 00000000..54669678 --- /dev/null +++ b/client_example/13_batchnorm/CMakeLists.txt @@ -0,0 +1,4 @@ +add_executable(client_batchnorm_fwd_nhwc batchnorm_fwd_nhwc.cpp) +add_executable(client_batchnorm_bwd_nhwc batchnorm_bwd_nhwc.cpp) +target_link_libraries(client_batchnorm_fwd_nhwc PRIVATE composable_kernel::device_operations) +target_link_libraries(client_batchnorm_bwd_nhwc PRIVATE composable_kernel::device_operations) diff --git a/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp new file mode 100644 index 00000000..8ef21986 --- /dev/null +++ b/client_example/13_batchnorm/batchnorm_bwd_nhwc.cpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
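+// Client example: NHWC batchnorm backward (Rank = 4, reducing over N/H/W) producing dx,
+// dscale, and dbias; each supported instance may request a workspace, which is allocated
+// and attached before timing.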
+ +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp" + +using XDataType = ck::half_t; +using DxDataType = float; +using DyDataType = float; +using AccDataType = float; +using ScaleDataType = ck::half_t; +using DscaleDbiasDataType = float; +using MeanVarDataType = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +const double epsilon = std::numeric_limits::epsilon(); + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem dy(sizeof(DyDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem dx(sizeof(DxDataType) * numXYElement); + SimpleDeviceMem dscale(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem dbias(sizeof(DscaleDbiasDataType) * numScaleBiasMeanVarElement); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + dy.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + epsilon, + PassThrough{}, + dx.GetDeviceBuffer(), + dscale.GetDeviceBuffer(), + dbias.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = 
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + numXYElement * (sizeof(XDataType) + sizeof(DyDataType) + sizeof(DxDataType)) + + numScaleBiasMeanVarElement * + (sizeof(ScaleDataType) + sizeof(DscaleDbiasDataType) * 2 + + sizeof(MeanVarDataType) * 2); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + dy.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + epsilon, + PassThrough{}, + dx.GetDeviceBuffer(), + dscale.GetDeviceBuffer(), + dbias.GetDeviceBuffer()); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp new file mode 100644 index 00000000..322667a4 --- /dev/null +++ b/client_example/13_batchnorm/batchnorm_fwd_nhwc.cpp @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
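+// Client example: NHWC batchnorm forward (Rank = 4, reducing over N/H/W) that writes y
+// along with the mean and inverse-variance outputs; instances may request a workspace,
+// and the fastest supported instance is re-run without timing.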
+ +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp" + +using XDataType = float; +using YDataType = float; +using AccDataType = float; +using ScaleDataType = AccDataType; +using BiasDataType = AccDataType; +using MeanVarDataType = AccDataType; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +const double epsilon = std::numeric_limits::epsilon(); +const double averageFactor = 0.1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem y(sizeof(YDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + numXYElement * (sizeof(XDataType) + sizeof(YDataType)) + + numScaleBiasMeanVarElement 
* (sizeof(ScaleDataType) + sizeof(BiasDataType) + + sizeof(MeanVarDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(found) + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/client_example/14_instance_id/CMakeLists.txt b/client_example/14_instance_id/CMakeLists.txt new file mode 100644 index 00000000..87b2a9a0 --- /dev/null +++ b/client_example/14_instance_id/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_batchnorm_fwd_instance_id batchnorm_fwd_instance_id.cpp) +target_link_libraries(client_batchnorm_fwd_instance_id PRIVATE composable_kernel::device_operations) diff --git a/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp new file mode 100644 index 00000000..9cfeee1c --- /dev/null +++ b/client_example/14_instance_id/batchnorm_fwd_instance_id.cpp @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
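+// Client example: shows how to persist the best-performing instance by its index and
+// GetTypeIdHashCode() (standing in for a perf-DB entry), then rebuild the instance list
+// and execute only that cached instance, verifying the hash code before running.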
+ +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp" + +using XDataType = float; +using YDataType = float; +using AccDataType = float; +using ScaleDataType = AccDataType; +using BiasDataType = AccDataType; +using MeanVarDataType = AccDataType; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 4; +constexpr int NumBatchNormReduceDim = 3; + +const double epsilon = std::numeric_limits::epsilon(); +const double averageFactor = 0.1; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +// In the actual application, the instance index and name are usually from the perf db +static int instance_index = -1; +static std::string instance_name; + +int main(int argc, char* argv[]) +{ + std::array xyLengths{16, 8, 128, 256}; + std::array xyStrides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array scaleBiasMeanVarLengths{256}; + std::array scaleBiasMeanVarStrides{1}; + std::array reduceDims{0, 1, 2}; + + ck::index_t numXYElement = + std::accumulate(xyLengths.begin(), xyLengths.end(), 1, std::multiplies()); + + ck::index_t numScaleBiasMeanVarElement = std::accumulate(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + 1, + std::multiplies()); + + SimpleDeviceMem x(sizeof(XDataType) * numXYElement); + SimpleDeviceMem y(sizeof(YDataType) * numXYElement); + SimpleDeviceMem scale(sizeof(ScaleDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem bias(sizeof(BiasDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem mean(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + SimpleDeviceMem invVariance(sizeof(MeanVarDataType) * numScaleBiasMeanVarElement); + + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + bool found = false; + int best_op_index = -1; + float best_ave_time = std::numeric_limits::max(); + + // profile device operation instances and save the best performant instance index and instance + // name + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, 
true}); + + if(ave_time < best_ave_time) + { + found = true; + best_op_index = i; + best_ave_time = ave_time; + } + } + } + + if(found) + { + instance_index = best_op_index; + instance_name = op_ptrs[instance_index]->GetTypeIdHashCode(); + }; + + // simulate the execution of the operation when the instance index and name are available + const auto op_ptrs_2 = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + if(instance_index >= 0 && instance_index < op_ptrs_2.size()) + { + auto& op_ptr = op_ptrs_2[instance_index]; + + if(op_ptr->GetTypeIdHashCode() == instance_name) + { + + auto argument_ptr = op_ptr->MakeArgumentPointer(xyLengths, + xyStrides, + xyStrides, + reduceDims, + scaleBiasMeanVarLengths, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + scaleBiasMeanVarStrides, + x.GetDeviceBuffer(), + scale.GetDeviceBuffer(), + bias.GetDeviceBuffer(), + epsilon, + PassThrough{}, + y.GetDeviceBuffer(), + mean.GetDeviceBuffer(), + invVariance.GetDeviceBuffer(), + averageFactor, + nullptr, + nullptr); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get()); + + SimpleDeviceMem workspace(workspace_sz); + + op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer()); + + float exec_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + size_t num_bytes = numXYElement * (sizeof(XDataType) + sizeof(YDataType)) + + numScaleBiasMeanVarElement * + (sizeof(ScaleDataType) + sizeof(BiasDataType) + + sizeof(MeanVarDataType) + sizeof(MeanVarDataType)); + + float gb_per_sec = num_bytes / 1.E6 / exec_time; + + std::cout << "Kernel execution time: " << std::setw(10) << exec_time + << " ms, effective data transfer bandwidth: " << gb_per_sec << " GB/s" + << std::endl; + } + }; + } + + return 0; +} diff --git a/client_example/CMakeLists.txt b/client_example/CMakeLists.txt new file mode 100644 index 00000000..14c066e4 --- /dev/null +++ b/client_example/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.15) +project(ck_app) +add_compile_options(-std=c++17) + +find_package(composable_kernel 1.0.0 COMPONENTS device_operations) +find_package(hip REQUIRED PATHS /opt/rocm) +message(STATUS "Build with HIP ${hip_VERSION}") + +# add all example subdir +file(GLOB dir_list LIST_DIRECTORIES true *) +FOREACH(subdir ${dir_list}) + IF(IS_DIRECTORY "${subdir}" AND (NOT "${subdir}" MATCHES "build")) + add_subdirectory(${subdir}) + ENDIF() +ENDFOREACH() diff --git a/client_example/README.md b/client_example/README.md new file mode 100644 index 00000000..64a7130d --- /dev/null +++ b/client_example/README.md @@ -0,0 +1,21 @@ +## +Client application links to CK library, and therefore CK library needs to be installed before building client applications. + + +## Build +```bash +mkdir -p client_example/build +cd client_example/build +``` + +```bash +cmake \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}" \ +.. +``` + +### Build client example +```bash + make -j +``` diff --git a/cmake/Analyzers.cmake b/cmake/Analyzers.cmake new file mode 100644 index 00000000..1bf1a52c --- /dev/null +++ b/cmake/Analyzers.cmake @@ -0,0 +1,34 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ + +if(NOT TARGET analyze) + add_custom_target(analyze) +endif() + +function(mark_as_analyzer) + add_dependencies(analyze ${ARGN}) +endfunction() + diff --git a/cmake/ClangTidy.cmake b/cmake/ClangTidy.cmake new file mode 100644 index 00000000..01b348c4 --- /dev/null +++ b/cmake/ClangTidy.cmake @@ -0,0 +1,162 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################ +include(CMakeParseArguments) +include(Analyzers) + +get_filename_component(CLANG_TIDY_EXE_HINT "${CMAKE_CXX_COMPILER}" PATH) + +find_program(CLANG_TIDY_EXE + NAMES + clang-tidy + clang-tidy-5.0 + clang-tidy-4.0 + clang-tidy-3.9 + clang-tidy-3.8 + clang-tidy-3.7 + clang-tidy-3.6 + clang-tidy-3.5 + HINTS + ${CLANG_TIDY_EXE_HINT} + PATH_SUFFIXES + compiler/bin + PATHS + /opt/rocm/llvm/bin + /opt/rocm/hcc + /usr/local/opt/llvm/bin +) + +function(find_clang_tidy_version VAR) + execute_process(COMMAND ${CLANG_TIDY_EXE} -version OUTPUT_VARIABLE VERSION_OUTPUT) + separate_arguments(VERSION_OUTPUT_LIST UNIX_COMMAND "${VERSION_OUTPUT}") + list(FIND VERSION_OUTPUT_LIST "version" VERSION_INDEX) + if(VERSION_INDEX GREATER 0) + math(EXPR VERSION_INDEX "${VERSION_INDEX} + 1") + list(GET VERSION_OUTPUT_LIST ${VERSION_INDEX} VERSION) + set(${VAR} ${VERSION} PARENT_SCOPE) + else() + set(${VAR} "0.0" PARENT_SCOPE) + endif() + +endfunction() + +if( NOT CLANG_TIDY_EXE ) + message( STATUS "Clang tidy not found" ) + set(CLANG_TIDY_VERSION "0.0") +else() + find_clang_tidy_version(CLANG_TIDY_VERSION) + message( STATUS "Clang tidy found: ${CLANG_TIDY_VERSION}") +endif() + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +set(CLANG_TIDY_FIXIT_DIR ${CMAKE_BINARY_DIR}/fixits) +file(MAKE_DIRECTORY ${CLANG_TIDY_FIXIT_DIR}) +set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${CLANG_TIDY_FIXIT_DIR}) + +macro(enable_clang_tidy) + set(options ANALYZE_TEMPORARY_DTORS ALL) + set(oneValueArgs HEADER_FILTER) + set(multiValueArgs CHECKS ERRORS EXTRA_ARGS) + + cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE ";" "," CLANG_TIDY_CHECKS "${PARSE_CHECKS}") + string(REPLACE ";" "," CLANG_TIDY_ERRORS "${PARSE_ERRORS}") + set(CLANG_TIDY_EXTRA_ARGS) + foreach(ARG ${PARSE_EXTRA_ARGS}) + list(APPEND CLANG_TIDY_EXTRA_ARGS "-extra-arg=${ARG}") + endforeach() + + set(CLANG_TIDY_ALL) + if(PARSE_ALL) + set(CLANG_TIDY_ALL ALL) + endif() + + message(STATUS "Clang tidy checks: ${CLANG_TIDY_CHECKS}") + + if (${PARSE_ANALYZE_TEMPORARY_DTORS}) + set(CLANG_TIDY_ANALYZE_TEMPORARY_DTORS "-analyze-temporary-dtors") + endif() + + if (${CLANG_TIDY_VERSION} VERSION_LESS "3.9.0") + set(CLANG_TIDY_ERRORS_ARG "") + else() + set(CLANG_TIDY_ERRORS_ARG "-warnings-as-errors='${CLANG_TIDY_ERRORS}'") + endif() + + if (${CLANG_TIDY_VERSION} VERSION_LESS "3.9.0") + set(CLANG_TIDY_QUIET_ARG "") + else() + set(CLANG_TIDY_QUIET_ARG "-quiet") + endif() + + if(PARSE_HEADER_FILTER) + string(REPLACE "$" "$$" CLANG_TIDY_HEADER_FILTER "${PARSE_HEADER_FILTER}") + else() + set(CLANG_TIDY_HEADER_FILTER ".*") + endif() + + set(CLANG_TIDY_COMMAND + ${CLANG_TIDY_EXE} + ${CLANG_TIDY_QUIET_ARG} + -p ${CMAKE_BINARY_DIR} + -checks='${CLANG_TIDY_CHECKS}' + ${CLANG_TIDY_ERRORS_ARG} + ${CLANG_TIDY_EXTRA_ARGS} + ${CLANG_TIDY_ANALYZE_TEMPORARY_DTORS} + -header-filter='${CLANG_TIDY_HEADER_FILTER}' + ) + add_custom_target(tidy ${CLANG_TIDY_ALL}) + mark_as_analyzer(tidy) + add_custom_target(tidy-base) + add_custom_target(tidy-make-fixit-dir COMMAND ${CMAKE_COMMAND} -E make_directory ${CLANG_TIDY_FIXIT_DIR}) + add_custom_target(tidy-rm-fixit-dir COMMAND ${CMAKE_COMMAND} -E remove_directory ${CLANG_TIDY_FIXIT_DIR}) + add_dependencies(tidy-make-fixit-dir tidy-rm-fixit-dir) + add_dependencies(tidy-base tidy-make-fixit-dir) +endmacro() + +function(clang_tidy_check TARGET) + get_target_property(SOURCES ${TARGET} SOURCES) + # TODO: Use generator 
expressions instead + # COMMAND ${CLANG_TIDY_COMMAND} $ + # COMMAND ${CLANG_TIDY_COMMAND} $, > + foreach(SOURCE ${SOURCES}) + if((NOT "${SOURCE}" MATCHES "(h|hpp|hxx)$") AND (NOT "${SOURCE}" MATCHES "TARGET_OBJECTS")) + string(MAKE_C_IDENTIFIER "${SOURCE}" tidy_file) + set(tidy_target tidy-target-${TARGET}-${tidy_file}) + add_custom_target(${tidy_target} + # for some targets clang-tidy not able to get information from .clang-tidy + DEPENDS ${SOURCE} + COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..." + ) + add_dependencies(${tidy_target} ${TARGET}) + add_dependencies(${tidy_target} tidy-base) + add_dependencies(tidy ${tidy_target}) + endif() + endforeach() +endfunction() + diff --git a/cmake/CppCheck.cmake b/cmake/CppCheck.cmake new file mode 100644 index 00000000..797dcf4b --- /dev/null +++ b/cmake/CppCheck.cmake @@ -0,0 +1,130 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +################################################################################ + +include(CMakeParseArguments) +include(ProcessorCount) +include(Analyzers) + +find_program(CPPCHECK_EXE + NAMES + cppcheck + PATHS + /opt/rocm/bin +) + +ProcessorCount(CPPCHECK_JOBS) + +set(CPPCHECK_BUILD_DIR ${CMAKE_BINARY_DIR}/cppcheck-build) +file(MAKE_DIRECTORY ${CPPCHECK_BUILD_DIR}) +set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${CPPCHECK_BUILD_DIR}) + +macro(enable_cppcheck) + set(options FORCE) + set(oneValueArgs) + set(multiValueArgs CHECKS SUPPRESS DEFINE UNDEFINE INCLUDE SOURCES) + + cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE ";" "," CPPCHECK_CHECKS "${PARSE_CHECKS}") + string(REPLACE ";" "\n" CPPCHECK_SUPPRESS "${PARSE_SUPPRESS};*:/usr/*") + file(WRITE ${CMAKE_BINARY_DIR}/cppcheck-supressions "${CPPCHECK_SUPPRESS}") + set(CPPCHECK_DEFINES) + foreach(DEF ${PARSE_DEFINE}) + set(CPPCHECK_DEFINES "${CPPCHECK_DEFINES} -D${DEF}") + endforeach() + + set(CPPCHECK_UNDEFINES) + foreach(DEF ${PARSE_UNDEFINE}) + set(CPPCHECK_UNDEFINES "${CPPCHECK_UNDEFINES} -U${DEF}") + endforeach() + + set(CPPCHECK_INCLUDES) + foreach(INC ${PARSE_INCLUDE}) + set(CPPCHECK_INCLUDES "${CPPCHECK_INCLUDES} -I${INC}") + endforeach() + + # set(CPPCHECK_FORCE) + set(CPPCHECK_FORCE "--project=${CMAKE_BINARY_DIR}/compile_commands.json") + if(PARSE_FORCE) + set(CPPCHECK_FORCE --force) + endif() + + set(SOURCES) + set(GLOBS) + foreach(SOURCE ${PARSE_SOURCES}) + get_filename_component(ABS_SOURCE ${SOURCE} ABSOLUTE) + if(EXISTS ${ABS_SOURCE}) + if(IS_DIRECTORY ${ABS_SOURCE}) + set(GLOBS "${GLOBS} ${ABS_SOURCE}/*.cpp ${ABS_SOURCE}/*.hpp ${ABS_SOURCE}/*.cxx ${ABS_SOURCE}/*.c ${ABS_SOURCE}/*.h") + else() + set(SOURCES "${SOURCES} ${ABS_SOURCE}") + endif() + else() + set(GLOBS "${GLOBS} ${ABS_SOURCE}") + endif() + endforeach() + + file(WRITE ${CMAKE_BINARY_DIR}/cppcheck.cmake " + file(GLOB_RECURSE GSRCS ${GLOBS}) + set(CPPCHECK_COMMAND + ${CPPCHECK_EXE} + -q + # -v + # --report-progress + ${CPPCHECK_FORCE} + --cppcheck-build-dir=${CPPCHECK_BUILD_DIR} + --platform=native + --template=gcc + --error-exitcode=1 + -j ${CPPCHECK_JOBS} + ${CPPCHECK_DEFINES} + ${CPPCHECK_UNDEFINES} + ${CPPCHECK_INCLUDES} + --enable=${CPPCHECK_CHECKS} + --inline-suppr + --suppressions-list=${CMAKE_BINARY_DIR}/cppcheck-supressions + ${SOURCES} \${GSRCS} + ) + string(REPLACE \";\" \" \" CPPCHECK_SHOW_COMMAND \"\${CPPCHECK_COMMAND}\") + message(\"\${CPPCHECK_SHOW_COMMAND}\") + execute_process( + COMMAND \${CPPCHECK_COMMAND} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE RESULT + ) + if(NOT RESULT EQUAL 0) + message(FATAL_ERROR \"Cppcheck failed\") + endif() +") + + add_custom_target(cppcheck + COMMAND ${CMAKE_COMMAND} -P ${CMAKE_BINARY_DIR}/cppcheck.cmake + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "cppcheck: Running cppcheck..." + ) + mark_as_analyzer(cppcheck) +endmacro() + + diff --git a/cmake/DoxygenDoc.cmake b/cmake/DoxygenDoc.cmake new file mode 100644 index 00000000..2e3669fc --- /dev/null +++ b/cmake/DoxygenDoc.cmake @@ -0,0 +1,355 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ +include(CMakeParseArguments) +include(MainDoc) + +find_program(DOXYGEN_EXECUTABLE NAMES doxygen + PATH_SUFFIXES bin + DOC "Doxygen documentation generator" +) +mark_as_advanced(DOXYGEN_EXECUTABLE) + +find_path(DOT_EXECUTABLE NAMES dot + PATH_SUFFIXES bin + DOC "Graphviz" +) +mark_as_advanced(DOT_EXECUTABLE) + +set(DOXYGEN_ARGS +ABBREVIATE_BRIEF +ALIASES +ALLEXTERNALS +ALLOW_UNICODE_NAMES +ALPHABETICAL_INDEX +ALWAYS_DETAILED_SEC +AUTOLINK_SUPPORT +BINARY_TOC +BRIEF_MEMBER_DESC +BUILTIN_STL_SUPPORT +CALLER_GRAPH +CALL_GRAPH +CASE_SENSE_NAMES +CHM_FILE +CHM_INDEX_ENCODING +CITE_BIB_FILES +CLANG_ASSISTED_PARSING +CLANG_OPTIONS +CLASS_DIAGRAMS +CLASS_GRAPH +COLLABORATION_GRAPH +COLS_IN_ALPHA_INDEX +COMPACT_LATEX +COMPACT_RTF +CPP_CLI_SUPPORT +CREATE_SUBDIRS +DIAFILE_DIRS +DIA_PATH +DIRECTORY_GRAPH +DISABLE_INDEX +DISTRIBUTE_GROUP_DOC +DOCBOOK_OUTPUT +DOCBOOK_PROGRAMLISTING +DOCSET_BUNDLE_ID +DOCSET_FEEDNAME +DOCSET_PUBLISHER_ID +DOCSET_PUBLISHER_NAME +DOTFILE_DIRS +DOT_CLEANUP +DOT_FONTNAME +DOT_FONTPATH +DOT_FONTSIZE +DOT_GRAPH_MAX_NODES +DOT_IMAGE_FORMAT +DOT_MULTI_TARGETS +DOT_NUM_THREADS +# DOT_PATH +DOT_TRANSPARENT +DOXYFILE_ENCODING +ECLIPSE_DOC_ID +ENABLED_SECTIONS +ENABLE_PREPROCESSING +ENUM_VALUES_PER_LINE +EXAMPLE_PATH +EXAMPLE_PATTERNS +EXAMPLE_RECURSIVE +EXCLUDE +EXCLUDE_PATTERNS +EXCLUDE_SYMBOLS +EXCLUDE_SYMLINKS +EXPAND_AS_DEFINED +EXPAND_ONLY_PREDEF +EXTENSION_MAPPING +EXTERNAL_GROUPS +EXTERNAL_PAGES +EXTERNAL_SEARCH +EXTERNAL_SEARCH_ID +EXTRACT_ALL +EXTRACT_ANON_NSPACES +EXTRACT_LOCAL_CLASSES +EXTRACT_LOCAL_METHODS +EXTRACT_PACKAGE +EXTRACT_PRIVATE +EXTRACT_STATIC +EXTRA_PACKAGES +EXTRA_SEARCH_MAPPINGS +EXT_LINKS_IN_WINDOW +FILE_PATTERNS +FILE_VERSION_FILTER +FILTER_PATTERNS +FILTER_SOURCE_FILES +FILTER_SOURCE_PATTERNS +FORCE_LOCAL_INCLUDES +FORMULA_FONTSIZE +FORMULA_TRANSPARENT +FULL_PATH_NAMES +GENERATE_AUTOGEN_DEF +GENERATE_BUGLIST +GENERATE_CHI +GENERATE_DEPRECATEDLIST +GENERATE_DOCBOOK +GENERATE_DOCSET +GENERATE_ECLIPSEHELP +GENERATE_HTML +GENERATE_HTMLHELP +GENERATE_LATEX +GENERATE_LEGEND +GENERATE_MAN +GENERATE_PERLMOD +GENERATE_QHP +GENERATE_RTF +GENERATE_TAGFILE +GENERATE_TESTLIST +GENERATE_TODOLIST +GENERATE_TREEVIEW +GENERATE_XML +GRAPHICAL_HIERARCHY +GROUP_GRAPHS +GROUP_NESTED_COMPOUNDS +# HAVE_DOT +HHC_LOCATION +HIDE_COMPOUND_REFERENCE +HIDE_FRIEND_COMPOUNDS +HIDE_IN_BODY_DOCS +HIDE_SCOPE_NAMES +HIDE_UNDOC_CLASSES 
+HIDE_UNDOC_MEMBERS +HIDE_UNDOC_RELATIONS +HTML_COLORSTYLE_GAMMA +HTML_COLORSTYLE_HUE +HTML_COLORSTYLE_SAT +HTML_DYNAMIC_SECTIONS +HTML_EXTRA_FILES +HTML_EXTRA_STYLESHEET +HTML_FILE_EXTENSION +HTML_FOOTER +HTML_HEADER +HTML_INDEX_NUM_ENTRIES +HTML_OUTPUT +HTML_STYLESHEET +HTML_TIMESTAMP +IDL_PROPERTY_SUPPORT +IGNORE_PREFIX +IMAGE_PATH +INCLUDED_BY_GRAPH +INCLUDE_FILE_PATTERNS +INCLUDE_GRAPH +INCLUDE_PATH +INHERIT_DOCS +INLINE_GROUPED_CLASSES +INLINE_INFO +INLINE_INHERITED_MEMB +INLINE_SIMPLE_STRUCTS +INLINE_SOURCES +INPUT +INPUT_ENCODING +INPUT_FILTER +INTERACTIVE_SVG +INTERNAL_DOCS +JAVADOC_AUTOBRIEF +LATEX_BATCHMODE +LATEX_BIB_STYLE +LATEX_CMD_NAME +LATEX_EXTRA_FILES +LATEX_EXTRA_STYLESHEET +LATEX_FOOTER +LATEX_HEADER +LATEX_HIDE_INDICES +LATEX_OUTPUT +LATEX_SOURCE_CODE +LATEX_TIMESTAMP +LAYOUT_FILE +LOOKUP_CACHE_SIZE +MACRO_EXPANSION +MAKEINDEX_CMD_NAME +MAN_EXTENSION +MAN_LINKS +MAN_OUTPUT +MAN_SUBDIR +MARKDOWN_SUPPORT +MATHJAX_CODEFILE +MATHJAX_EXTENSIONS +MATHJAX_FORMAT +MATHJAX_RELPATH +MAX_DOT_GRAPH_DEPTH +MAX_INITIALIZER_LINES +MSCFILE_DIRS +MSCGEN_PATH +MULTILINE_CPP_IS_BRIEF +OPTIMIZE_FOR_FORTRAN +OPTIMIZE_OUTPUT_FOR_C +OPTIMIZE_OUTPUT_JAVA +OPTIMIZE_OUTPUT_VHDL +OUTPUT_DIRECTORY +OUTPUT_LANGUAGE +PAPER_TYPE +PDF_HYPERLINKS +PERLMOD_LATEX +PERLMOD_MAKEVAR_PREFIX +PERLMOD_PRETTY +PERL_PATH +PLANTUML_CFG_FILE +PLANTUML_INCLUDE_PATH +PLANTUML_JAR_PATH +PREDEFINED +PROJECT_BRIEF +PROJECT_LOGO +PROJECT_NAME +PROJECT_NUMBER +QCH_FILE +QHG_LOCATION +QHP_CUST_FILTER_ATTRS +QHP_CUST_FILTER_NAME +QHP_NAMESPACE +QHP_SECT_FILTER_ATTRS +QHP_VIRTUAL_FOLDER +QT_AUTOBRIEF +QUIET +RECURSIVE +REFERENCED_BY_RELATION +REFERENCES_LINK_SOURCE +REFERENCES_RELATION +REPEAT_BRIEF +RTF_EXTENSIONS_FILE +RTF_HYPERLINKS +RTF_OUTPUT +RTF_SOURCE_CODE +RTF_STYLESHEET_FILE +SEARCHDATA_FILE +SEARCHENGINE +SEARCHENGINE_URL +SEARCH_INCLUDES +SEPARATE_MEMBER_PAGES +SERVER_BASED_SEARCH +SHORT_NAMES +SHOW_FILES +SHOW_GROUPED_MEMB_INC +SHOW_INCLUDE_FILES +SHOW_NAMESPACES +SHOW_USED_FILES +SIP_SUPPORT +SKIP_FUNCTION_MACROS +SORT_BRIEF_DOCS +SORT_BY_SCOPE_NAME +SORT_GROUP_NAMES +SORT_MEMBERS_CTORS_1ST +SORT_MEMBER_DOCS +SOURCE_BROWSER +SOURCE_TOOLTIPS +STRICT_PROTO_MATCHING +STRIP_CODE_COMMENTS +STRIP_FROM_INC_PATH +STRIP_FROM_PATH +SUBGROUPING +TAB_SIZE +TAGFILES +TCL_SUBST +TEMPLATE_RELATIONS +TOC_EXPAND +TOC_INCLUDE_HEADINGS +TREEVIEW_WIDTH +TYPEDEF_HIDES_STRUCT +UML_LIMIT_NUM_FIELDS +UML_LOOK +USE_HTAGS +USE_MATHJAX +USE_MDFILE_AS_MAINPAGE +USE_PDFLATEX +VERBATIM_HEADERS +WARNINGS +WARN_AS_ERROR +WARN_FORMAT +WARN_IF_DOC_ERROR +WARN_IF_UNDOCUMENTED +WARN_LOGFILE +WARN_NO_PARAMDOC +XML_OUTPUT +XML_PROGRAMLISTING +) + +set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE PATH "Path to generated doxygen configuration file") + +function(add_doxygen_doc) + set(options) + set(oneValueArgs) + set(multiValueArgs DEPENDS ${DOXYGEN_ARGS}) + + cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + file(WRITE ${DOXYGEN_CONFIG_FILE} "# Auto-generated doxygen configuration file\n") + + foreach(ARG ${DOXYGEN_ARGS}) + if(PARSE_${ARG}) + string(REPLACE ";" " " ARG_VALUE ${PARSE_${ARG}}) + file(APPEND ${DOXYGEN_CONFIG_FILE} "\n${ARG} = ${ARG_VALUE}\n") + endif() + endforeach() + + if(PARSE_OUTPUT_DIRECTORY) + if(NOT EXISTS ${PARSE_OUTPUT_DIRECTORY}) + file(MAKE_DIRECTORY ${PARSE_OUTPUT_DIRECTORY}) + endif() + endif() + + if(DOT_EXECUTABLE) + file(APPEND ${DOXYGEN_CONFIG_FILE} "\nDOT_PATH = \"${DOT_EXECUTABLE}\"\n") + file(APPEND ${DOXYGEN_CONFIG_FILE} "\nHAVE_DOT = 
YES\n") + else() + file(APPEND ${DOXYGEN_CONFIG_FILE} "\nHAVE_DOT = NO\n") + endif() + + add_custom_target(doxygen + ${DOXYGEN_EXECUTABLE} ${DOXYGEN_CONFIG_FILE} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + COMMENT "Building documentation with doxygen" + ) + if(PARSE_OUTPUT_DIRECTORY) + clean_doc_output(${PARSE_OUTPUT_DIRECTORY}) + endif() + mark_as_doc(doxygen) + if(PARSE_DEPENDS) + add_dependencies(doxygen ${PARSE_DEPENDS}) + endif() +endfunction() diff --git a/cmake/EnableCompilerWarnings.cmake b/cmake/EnableCompilerWarnings.cmake new file mode 100644 index 00000000..78133af0 --- /dev/null +++ b/cmake/EnableCompilerWarnings.cmake @@ -0,0 +1,110 @@ +################################################################################ +# +# MIT License +# +# Copyright (c) 2017 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################ +# - Enable warning all for gcc/clang or use /W4 for visual studio + +## Strict warning level +if (MSVC) + # Use the highest warning level for visual studio. 
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /w") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /w") + # set(CMAKE_CXX_WARNING_LEVEL 4) + # if (CMAKE_CXX_FLAGS MATCHES "/W[0-4]") + # string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + # else () + # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4") + # endif () + + # set(CMAKE_C_WARNING_LEVEL 4) + # if (CMAKE_C_FLAGS MATCHES "/W[0-4]") + # string(REGEX REPLACE "/W[0-4]" "/W4" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") + # else () + # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4") + # endif () + +else() + foreach(COMPILER C CXX) + set(CMAKE_COMPILER_WARNINGS) + # use -Wall for gcc and clang + list(APPEND CMAKE_COMPILER_WARNINGS + -Wall + -Wextra + -Wcomment + -Wendif-labels + -Wformat + -Winit-self + -Wreturn-type + -Wsequence-point + # Shadow is broken on gcc when using lambdas + # -Wshadow + -Wswitch + -Wtrigraphs + -Wundef + -Wuninitialized + -Wunreachable-code + -Wunused + + -Wsign-compare + -Wno-extra-semi-stmt + ) + if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "Clang") + list(APPEND CMAKE_COMPILER_WARNINGS + -Weverything + -Wno-c++98-compat + -Wno-c++98-compat-pedantic + -Wno-conversion + -Wno-double-promotion + -Wno-exit-time-destructors + -Wno-extra-semi + -Wno-float-conversion + -Wno-gnu-anonymous-struct + -Wno-gnu-zero-variadic-macro-arguments + -Wno-missing-prototypes + -Wno-nested-anon-types + -Wno-padded + -Wno-return-std-move-in-c++11 + -Wno-shorten-64-to-32 + -Wno-sign-conversion + -Wno-unknown-warning-option + -Wno-unused-command-line-argument + -Wno-weak-vtables + -Wno-covered-switch-default + ) + else() + if (CMAKE_${COMPILER}_COMPILER_ID MATCHES "GNU" AND ${COMPILER} MATCHES "CXX") + # cmake 3.5.2 does not support >=. + if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS "6.1") + list(APPEND CMAKE_COMPILER_WARNINGS + -Wno-ignored-attributes) + endif() + endif() + list(APPEND CMAKE_COMPILER_WARNINGS + -Wno-missing-field-initializers + -Wno-deprecated-declarations + ) + endif() + add_definitions(${CMAKE_COMPILER_WARNINGS}) + endforeach() +endif () diff --git a/cmake/TargetFlags.cmake b/cmake/TargetFlags.cmake new file mode 100644 index 00000000..4f83fb5d --- /dev/null +++ b/cmake/TargetFlags.cmake @@ -0,0 +1,50 @@ + +function(get_target_property2 VAR TARGET PROPERTY) + get_target_property(_pflags ${TARGET} ${PROPERTY}) + if(_pflags) + set(${VAR} ${_pflags} PARENT_SCOPE) + else() + set(${VAR} "" PARENT_SCOPE) + endif() +endfunction() + + +macro(append_flags FLAGS TARGET PROPERTY PREFIX) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + else() + string(APPEND ${FLAGS} " ${PREFIX}${FLAG}") + endif() + endforeach() +endmacro() + +macro(append_link_flags FLAGS TARGET PROPERTY) + get_target_property2(_pflags ${TARGET} ${PROPERTY}) + foreach(FLAG ${_pflags}) + if(TARGET ${FLAG}) + target_flags(_pflags2 ${FLAG}) + string(APPEND ${FLAGS} " ${_pflags2}") + elseif(FLAG MATCHES "^-.*") + string(APPEND ${FLAGS} " ${FLAG}") + elseif(EXISTS ${FLAG}) + string(APPEND ${FLAGS} " ${FLAG}") + else() + string(APPEND ${FLAGS} " -l${FLAG}") + endif() + endforeach() +endmacro() + +function(target_flags FLAGS TARGET) + set(_flags) + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_OPTIONS" "") + append_flags(_flags ${TARGET} "INTERFACE_COMPILE_DEFINITIONS" "-D") + append_flags(_flags ${TARGET} "INTERFACE_INCLUDE_DIRECTORIES" "-isystem ") + append_flags(_flags ${TARGET} "INTERFACE_LINK_DIRECTORIES" "-L ") + append_flags(_flags ${TARGET} 
"INTERFACE_LINK_OPTIONS" "") + append_link_flags(_flags ${TARGET} "INTERFACE_LINK_LIBRARIES" "") + # message("_flags: ${_flags}") + set(${FLAGS} ${_flags} PARENT_SCOPE) +endfunction() diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake new file mode 100644 index 00000000..3c6cb56c --- /dev/null +++ b/cmake/googletest.cmake @@ -0,0 +1,49 @@ +include(FetchContent) + +set(GOOGLETEST_DIR "" CACHE STRING "Location of local GoogleTest repo to build against") + +if(GOOGLETEST_DIR) + set(FETCHCONTENT_SOURCE_DIR_GOOGLETEST ${GOOGLETEST_DIR} CACHE STRING "GoogleTest source directory override") +endif() + +message(STATUS "Fetching GoogleTest") + +list(APPEND GTEST_CMAKE_CXX_FLAGS + -Wno-undef + -Wno-reserved-identifier + -Wno-global-constructors + -Wno-missing-noreturn + -Wno-disabled-macro-expansion + -Wno-used-but-marked-unused + -Wno-switch-enum + -Wno-zero-as-null-pointer-constant + -Wno-unused-member-function + -Wno-comma + -Wno-old-style-cast + -Wno-deprecated +) +message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}") + +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG b85864c64758dec007208e56af933fc3f52044ee +) + +# Will be necessary for windows build +# set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_GetProperties(googletest) +if(NOT googletest_POPULATED) + FetchContent_Populate(googletest) + add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR} EXCLUDE_FROM_ALL) +endif() + +target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) +target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS}) + +set_target_properties(gtest PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(gtest_main PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(gmock PROPERTIES POSITION_INDEPENDENT_CODE ON) +set_target_properties(gmock_main PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 00000000..9e7b9f01 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,3 @@ +ROCmSoftwarePlatform/rocm-recipes +RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build +danmar/cppcheck@2.9 \ No newline at end of file diff --git a/doc/image/ck_component.png b/doc/image/ck_component.png new file mode 100644 index 0000000000000000000000000000000000000000..db892331d77273208f861eb68f1df46d0f2667f4 GIT binary patch literal 565049 zcmeFZdHnQbc`y8cw1A=rE-fk`RTMYOGD#)~!X%S@GBcUUB$Kp?Stt8slFUpJm8w6qKiHH}pf#z3&de=ot{ILdc2e04Bf(N}NoH7uLwIC0u`L3hnc(Hm*8})f)5x1#V7^N zzy-W_-)W8ao&rh+d7gN>XG>vqBEn*zViX6(CrP|2k4{8raGr#*2M(O)#Ov-6x3Kmn zhih;<0-cQB+aetl>lEC?!cgyI=wu9rk&|J#2X6k4p>=n|C!?TjZuwRb{-@@;QG1oO zx^KiSR0o{2>lbL6&G>A|oSEJ~(w$e7A(*CnC+)Y5cMn%h?%h}Fdb`j8J?TnwFf-+S z#h_U*c?_6-H?&rTm4eIR{WI+By}7#Nu@D_Ijwx5=Gc^9b3Ni}}>fXkj_)KS==Iabh zHJI0~F2c-B2bX({0A`jQ8l)Jy(;l>N`V(R*+T6O`VhNToyn(EEhTf&tlIYJvsw5^u zoZgU@w%H8zz@7Q?27<$=CsuiX9f{_&X7m#gYQ1974LAns%pETjQeN~G-^olx9;p*; zLMuBa+5`fW=sKkYJjJ-4bAo( zeCQl)KzTZX z%Vj0vWs6|ot5MC^k+5ZLtk2DB&nK`gsI$i9 zxfr`skIbgQObzB*oXm6yu@<0Zxz*NX!euqj4s=Oq62YV0lmV3qO1dXms-KUW#@ufD zsDe8a1@Y)rstb8oZiLAsq6;QtI9!oxg&6O!px)Dv#Ozwz?He0=RVZ@BEyOaexmf9& z%f$Ctr~s>%v|cx-Q))VySqx?6h%ttmVm{vKGAuQsD>jXmr==ImW}lFP2|m>q!}TmO z?4E*TMlfGdCNSl*asI!TEO535zq44H9RAXggZdb`_l?og|;982#_V9KwvYaqeu#FX}u|f8Cx}o3!7^{6{ga0=_|fv(79wbqsWreZ@`wGF%8 zYuuV-RT4ZiWkqh6iVRNXGTw~P 
zd+dXGmk&u^99clyd2~!wO#ddJt#sb)ve2qy{XvJ7Qv2~y_-@SdcE-j#KjaXa24cUj zUp2JO&kU1;R`F4PJ_U_lZZD?pqn5jKY{gIXjy^u2!Lx!fP)A7c27c)qp2#lJT4Ht6 zRUo3cMf!8ke+uFsghnLZkDd_YfXGaq9G{!Z3wrZ2o4IkNUpxc)cR5z^nSR@Kv;IG^ z@(oXU5k{OVW6ba}FjsLorv1wOn4e^px^)!gzwn^Yd|M zteC~)jg8o$+G2roqO#cKrC31$4%SJEt!^7IqBJ%4N3UYo;fwA0bkD0JaiCG$@lKZC}c70aRq;f@b&HS3+zTRsECACKGb0 zY`@y17n9JP-`6>WsM@Y5vOP%ESkAmIj1;GYr`JHkGae5#WlSEg$B>9o!P=4whb}Fm zyt#sUNYE-boGL8xd|9T6K-+p>(Tc~cH{Q_@>i_nKSD(->TR!L9{9NL>*Gw&cwM!>Q zmalrDPPA5?ojALtS{_irSMozZrlF+t?@0eV;qR2Ny-}ee(ZQ{d&U4twqMBiNBe#kM z9@PVnK5ZRi2uN=F3hX(evd+}atL^Z8YWm&|81MziOSu;(S3{I@WXG~2slQ&!zq?IH zg?^ssRh;NZH(^O}@r_S~_0tIu%{hG60lkrN;@Y7Ab=I)1qB7bgHZc{!+I_f3+nh36 zIuzbhm&>L+{DZ)wuH08{TdbVdzK}WIf6*+;S8y458t_F0^8-Fp-Vo9){BP6nc=j)D zrfW87(nexYT7U5Xy}?%8ig2Ec(m_-WDKpsoEK6dqK_koJ%qjah0FIQ^zQ zuMM-^0ixn`*@M+ps^28NH-kjCR@O?1CeJd7RIpuEb zI~lUNQ1mWW5`!d^#ARk7|)+xrbwqVN45MV@B%wTRD zt-a@3Qv5T0fkW>cZ_6Yl!9lOq6^zmfr~iSu9d8A7Z=`cY_IR`Dcyqqa0jHk%!}FNu z;_OB~RO7GO3I^sdANpPZF)4xVCwvPW!e8KuFQ~ol%yc|LRJ*%R28Ui)K<~b=OduP} z;VIcBz3?@$dppSr#@ZtKNvOd!;vJCJf#FC$08JIfHvGo$6P4ohvDL{8l>~f4$vkcc zFEa%+2g?*iJIzm+W&PiVp@tSy`MmK9!DF_q)C&|u&%1H*RqJwE5=Bu4AGLDmkJ>pI z*FU7@q@-}M?<^5RY3O#@9VRRL(|up&$kaFre$V`h?1UG5`{?!@Z|nzG4MTz;@DCbm z!j5I+WO6@jTI~RA^IB3Z@qCcEr6dM1pq3zHk8SpJqvrL}>t#z5RB-AujHmyu9xQIk zkk4CvRGwetbQQ6|2~m8!F%%vcsXWhlkS)K*p|W`Evi(Vs>Pt~k(b&aI+0r+EC%h+N z^^=7vz^HNpaa{WNI!oW@OH@B@r~I7l%O;tbjHtqMBXl9r#FDsO9wU}sRi@x3n@(3q zwQp$q&8J9enKdfNb9%pn&~VIv5+PU$`CZ2S4xTT`kxx|X9Xp;azB$XU#o3-+88Pq7WwC>OR2#O71t z|HIi^heg?b?ZOKBfPzxeC9R|&U4wK8NOuS--JJu1Gz^`>(4~NM34=7!jdUa3HB5ZB zk3P@u-QTyr{qDW_W4PylW9FKBuC=bU&ULPHiKpHtS?~2!508${+rW7s$)sX_Jxa^0 zF}af_lG5TiRH927j&fEDjw|mfqhR5zq6xI4tRF9QW92sVi8*_FLr6qq&X`}-y*P0G zzAFd#!ZgKE)iH-N{&(~$;xu0L)6aO2drbM5W{A|fJn3}&@rJ8zo7?*K-&S?|pR7aT zG1udxCwsT|#VR$(zkepLF7bH8Hf3 zY+i z)5>kHK5?-AETmj2grEW4FMy&&pjl#m%(?BgR!jQxY$mkM=fcRdh2eIBv@h!&7U)!Vd*kkZ znl6@oAPT77k|-A&UOSWAFwmzWYhugqnB)QZ(D=kQ?XL{sjGr!$GZU{6$AWWgLFDo$ zAQ++9S%5wH*(hPQ<`fW&piBUM6wrcWE2>L_OczO?he&m1TU|sbg!~1GSv7n25O1cy z*YGkba?XXn)`e=5(8V`I)YJ^pA47n9>|jtO;IbyljNdtj z?xRYeD*($T}5 zP%QIlQ?vd>Chk5O2vQ_n`?`F7KBj2f#TeGm6|8)sCU&V++Nc4Lx=N(B+%~4mM}SrUS%F)Kfg6W+Q&Kk9*q)dUztB+Mz%3zN6WnIaE2+iP&~u-u5y14#;&0r^ zB}|kvOFY=WEja^|qdGPM6eLLh7=**07VgJlN3O1p`LA1K)mPdP_!%Zm`711r0n}}H z|HdxAV2wAJcwe6VEII^hKO}|vq6qj-ZQ&{?_p>AJCjb2q{KAPcolt=5&EO;AlXvYy zauMa9g^Ua*8=AWL)vz#zMh=6Lj*%OO;6h225075xMyF41OR0?N*1Zh5a9cNx4J++_ zJDUQHHTdAn3%;m0dtx-wW0pMYW=%esN1Wuky%-KC$V!UvALj07c!V|SI!;wrqICT2+u-#jBn&B&N;$j-TV`xNWmHdrZ@4TfV9$sjv8e7U= zXS{FN5_hpT!v@#NG4q1|#M3B|EI84t3N3oAYIQz#%ei7x<(y)opcJ5HDaFw@3H~Y? 
zUT}z5p${-4@Q3Ha>y3Kq*4MU_nG};o_H|rcYw*V%S@kYildQg<;XUQ#&(cQKMI0>o znO-~|%q=A_N0bE0`7!uxOvJ;GRf$8;#@Lc;LS$-lSh-;n>w$TW{@jHfPfhhe7P#Ja zrZ8+p9PuVT?Ch-#Xca!uaN;?U`(>cc!xN0pVPJ``SY#^Jb($^nq1={?ntZuG*P43eW=iQ`tP%Nr^IyRIJ!W@Q!g|Nr2T~sC~N!f*~FDimFB$kHa(U9MJZ9P${0_e8pDJHR{ zy9P0IAWzmXWN-o22+M4(8>!QdHYa=UQ?0^B1&l@5H)}s#oL|pD5!+bIr@a_2vU&jD zdg<}Krcblb*2>^p;Y40tU?0?1!cLfsKK@4)H($B60l$UA=4?!KG%*3@@JCTL&b}}Y zdt(?A{Zk%`-VY4$%8^QmZy5CLQ26thJ^m4kfFlUTJ9RObKQaV3HQv2(@uihZT?=#A z5f)0~mg6@s=M}K$;GY@@x}Xt5Enf(FF94PIU`Vl@R(Bhqfspk9q_LNA_*x|EY*f0` z02cG}`OQ>d3zo92@dbBlq4(epuc;%*+0pSakX%vXT}CrBav0RK_kuVE@#^_HmqNYI zcc11SSoQbi6$O8Z#WCp+^)VfOaop@3le;9%HoCPg+;q7k+adcfXJ9s;19`{o_J87r zzf*8iiRG?O*t1_8%1%Q6fa?KIuowWw)59z(*9X7LQ_i%17*9|_>WcME6+e-Oyqz*B zYjkTxk4VV{Z(+^cM*P>BIOVZ~W`5;WiNSFBI_Mpl&?DDgcaRU9_v0o~M-JD*{01QZ zSDNmRpgI)q57V?XNB*?Ei$lUIa=?}OcnT8X2j33Kht5cL(xV}h@NbVI~kECMD&h_w~uhWef+?54J zAXSoW{vu{*c&>iyq}FZ;O=S~?!PX&>oYC_tGeZJ2_*XdgMV#&lK~(4uZ!8@ z6Mv}}*!Iw!(I9xBrh=tk_Kxr5obn~Wf6^#dRZ`AN;W8Pkixx?A?X;CMXNlGZu%Ki4 zm-6`l{1*8n*zjuhMJuLsz6jkz6-kNc zo%Of&>Yw-I{Mh`2eyCC+C9?6_%MndyXAG0cf12F?B~GeD2h2tAK}`(A5`=3ec&$$| zeGR1884t^POTHvDWNl^2RPE|uu@YQ+au)d=b5A-LE^YdUY;k+iBvF68-~8U_Te*%=v{73#+w65eAOaqTzS>PAuiw zC$6tyr-|H-3$?q>>6#rV#;4qI+MGsGp%;pAD^1>&0lg;J4>2+Mv2R1##Em@PIrKDS z95@)mygBX|PjiO8YF4qc+Up#pU)W7q8JSojd22oF7YGPi*J~=HaBdd!AVt5C?4Gfi$gJ-CSb7;nZ40TBK19&oDSN0fGp$ve>1aWnKoK&0R6c`Wls zw{zV6bLmY+$3Cz{(f2B>(QwlD_D#E_Pk6cbt9A3goegb|;TWtZQ58c0HkAD~Qk&J? z;vQG`&AVP@I+cVEE<{C9c`*yZtZNV;K=vvX;A-sC8DL}@_iX|(mlE>j$Ad56W0?1^+2*7wM#Z0xv_hfX~1GQvs5KJELVn=eje;M zuac;AeBaj6J4&ZUB~PA{+_>#czlO|I1c8na4~Gw{E`9I$269~(2syxrowx4a%@oPs z17;cM%ea+J`*0oR!%%P=_;C+y-`re=Y!umxg(kN=fC>CHrOw5N0f>Vy$fePLh&FPM z@yXBQ5k+1aFAby4j%r>gSx*+*!KEC%PGg{B%ryUB2S0bx2GGl*g3W&63j4xep~wbp z+)GZZL|J8uu?ee$5`at3e6u+xa4-UDMS%+7F#l6hIt-;S)$D@E|AWKeqL&8b&=Kv|_IrB}XYC99~yIowjDVNXGU2gl`nRi0M803T(o<{Hks5I<&{> zs*OMfh+5LDo{JX%wbLgJ3iJ#?WJ9|1g~NH)PK=kpEHL!*$r6*fVp{NrEIyF^84?8fb64hF|RHrqH@xj(YES0TM~y3UP!lMgpsYd`L2 zM^;DmPqIy04#B{LxqMk$Kw7ISty&XoU}*dC%LAdipI+2havfB!LpFC>2%LlbQ7fcO zX1Q6=G}4YVC>BSSH_H8R$dB&ThGjJSy!o-N6y&dW$e_FeJHvXtus4lg9Nh;x)z;V7 z8M__qK!cGhxNEOLhN5VJ(U{s(204b7EY9p*kzz<#Ro|*9IhX^FJaX>k@k<3|f(w}4 zn)_(V(1OTVu@ggI4t^xD-62-piNS>V91{2FPS%mf5%s?;Cub78&erIH0BSa%==)-C z{14M)>72~!9LZ-hmEY`9z%}6bRHvNk0kv>qE8;|{)UdH?@_fAq2PoV_s$;l+vjzeN z|LJ7*mN%&Ke0s9`{QIU|w&cHv$DiWC89@VuZxZU$ zrRj`}J}F}j3JmGOemsqj+_+0Nr=sW1E@8fTBC;ed?CC5rKD!Gr7N=S9I94qU8ZqCO zAk-EMfsFk~Y#{wkY)gxnRZn)P3B6p%jU6BLJewb7dY2a_}a9|Pk5wvYXe5@;2U24mf>3dk4eH;G?%3Z zXMI~5CB7d{^hzZoZIW3PRF%H3svgn>6~Fv3)-?Gngnh^CW(uw>yI)noajg}>Ka<~x zrF?xRwaS}eSR;N@guafa@aC=2eTyMyVbJjY{X=TLIZ_UO?~6P;X&(p;cyG2YRGY(y z!`^PRR7g2p_yu`nqe^wlXWgp!`R~nYK%QUXS5)_1Z0bUz-(dFmi-gm5`-baYuhVFB zmY+}T(i0%IeYT@X)P~u7+L|yFF8xJkn)mk#Q66@`C zKN9*Mvz%KPbuX5ew_9A-jKhjQ>lf_QoP+U01S1JSN|(^nXukYKg{ z-L3P|Wi(=;<~^+m?3-llyAoXn!m+ykZ9Odb8H4$|g@KnfpO^VU{qZBt>URT$7aYRx9P&zMi&UX1E}H|~Rux{So301*jh9o9`B{9R zBgt)Z#NkhJ%Z(j{7)Ut?T_i%57_{7Wa~My!GH6#6v=I^6_Sjby8oVoxCH^Dvc zj=z#uCLXGy_T6!qPV!#Lo_x4(z7dxFQiJkyE%T4p!?61+lqQ!K`vw4-DMT&wM8+46 z7#=Tav~bDu@9yaue-94@;{XqIo6Hxf>LbB)HY~Mh)UwgZX0P>MdFJi_zM`IgY zcYkO}hnRCyLOlSAK@NH-og-L;iendm_#+ZSz6;jc13BWeCg`*r$tgQ(L!h)bZle_u%-XEOBLueE(u$H(xv^e5=L1TE zQ|u;6&2{@EeuvMqQ^E#%`~-xt(tj-VuC4e8z1{ACDiia!H8?%`3bh#i)$bS9-}RW?)z#(QkA(Dquj2EDgs^=*HIh z^cXjh)5{#XI6372gPLDm8ZWr+Ys~ped)`4_Aq~w|9;K8{etI=()-^LJq*2}ly%!#S z_7f!)0RgieXx%W)T_J>E5ip$QG;uA(ijRSQE@$1~p^tBOEYfOd-9+?7YZ0%Ls4bk) z4rW{9MX%;!>|S<56BDJ`NGLs(jze8P(#-ovps!)*`%WA<;xI%%ZL#%vsrIpeocsDY z1EZK$=@e*EwA1Bm^z7*`#iB@*%j!=BpeN!w#4G9gal+N|b;70))ltVa;npthbZFER 
zo^Q<7jot!!1!&Ma30HTOl?(ZmOJ;06U=@iSrX_1~`PtqKN)`hw}Bd zyFNwS)tC!>+}`Unbhw}h-G+5>8+{8DK>JoaktY82-@Ir7fNcojsRuQAY>E3_mN8l` z^Z1_#?krZ@gxnmi$*Evh#?`*vtOpm&O5$4lq?Oash+GM|x*5r)qdM=c+ZB=9O24)U zT84Bv*6vQIEU6X&td19wd));dXybK|HyJVet5?^C9B{JLL$*6&h`ThXVL&RUMO-O2 zUdz?=@P&f>cr{xzg}{42{xRZoc(-_(*Wsc92*9*tG}X(z`3n3 zfN{();#49%4A%m`!>vG#NH#VvDDOn50z&oLy366y&$g_fi5vhe7qDU{cIUKpMgdst zU&ysxv__#83D&zFczSB0C1YLR9)cF+Eh&dsrQc)ORK^|5EF-K_hDz=W*VcOaM}&uu zl6F{KkXGd7CSI?~2ZFE0U9D$|1D(Z{6cn^KBLk^~?1S;WTKnON9~0SwGd8d6dXkN5 z-UgkO-2k+a1dC6}vZocrJ-sm>KjLWIzQJgMT69839n5}O5SLe7jhQ-hBRX;$0Ni#g z63| zY(5=TQ$PU7UuNfB{r!!y)x#)+MoVk&dzln7c4j_U>-K3Nqw~N+RQxsRIi|@B=@9g= zs5F-Y=hA(8tnsa9dwz0!6?g4Foui8z@+l=gcvONC3(Xz^ zjiHwZD=RCSb+$iVPNvXug#Zl~L&Dv-cmDSH{4f6DffkexWi6Db+2LhOL@WMt8vJwM zexbRo;?@&4DENROMI6&x$?VJfyByl>PtSBs-Je}`WgiAVw!@* zycBRnl6mC6e5-0Z9((o`r@)h)A>oP zD1_h?q$+oXE@7unu2$b3i{N+(uI3b&?TaDazulTzs5zGDd2Vp9%D!cVB_DWm$(yl3wSarwY+42?r7(8sfmQUoxMP& zN9XGF*<>x`NMQ}6ax@-P<8JonO2OukE@j8NRzY+1DwCh+W4A-M17>2YB;*NffHR7T^7ZZ}j zS~Z>oN$Toji)lmzvQU#u0w=>p-v?`Q(H@UOT%yslq#fitq?JRTtw5%yAsp7(NMCsN zn>5vlE2fCCiSU(~7IQqs6t)jQu4KKfyK$A&ufu-FFWHPl3VG4xNPmZCrYPT(>bdC4 z0yp*>EAk{(KChF@NhgJ23KAZb%&qN7r2=)nN5B4A>>UENSgRv*tLD*}O!wtzo?I%w zld!w}Qh|Q4ZUF``Dj}~t(hx}hbSjKjQ$8p;A>j|gOv_AHFuvDm)(9)mimb6HGm`1y zZZ)9n2s5AA#_wnkbUG%V%aQ!_%$D3xwWC09ClnD-P!-a%>cer_Hh{kvsYi7t zu7vuR>be}uKP}yn1SULeUNFH|&ibBLT5K&*WDV*I$ShRsK32<>=I5^44DFR1PmNZE zXw_0^jWkKwFe;eg7wVJr!xPDNvS^|jr_)k z?4Tc6w^!v54Ima!9JCKohrbMOI}hCLm}?bhN1JnV8j4kfs1D41(J*w|D6viyAg~mXXTszsq$`Jd#KfOYdr&c z{2cX%*NkaMu-af zQ}`9j%t9_V`gOQfg6%CH;JpZqIg-1rkWiWd`;<~)wpZ*V2)=kPyfwEOJ(}GozzL%j zbUI@U!9APq7O)M5DgG#}FDlCbq+7X{KDq2n-Czv1+C+37wuC0IsJ?a`A)9aY!z!Dt z?Ib<7W_bNRcozA6qd!F6deaG>Bk>JEa`_x+xa+~AhwER4mcqREhewIL_AdFOWVDwNlo`l-Fg$aAc`_qE1~4}$sH9!XrlaK=|L&O&}t+fnL7q6s)h}tX7 zZ&VnSl5zc7#FCzVHNUGRjjvTq|e+S+gBICa0i5428u!gig6P(Zn!pXN-L972h1zBm;L(O(V3 zHuge4nSN}IF>du0YQ1K5?!J9ZFm=8=9mlMkF1SBDH%r##=&(Cbx%Au79(Ch4xNR|~ zpO-?gAh;0-9-00;!|?k%In^bp_N&)F0&X2Y6P?Hw8=?7$2kdmZ>)74f(^3amW)2`n zBp7XGSe)-BT0rsiQcutIpiDzCnlT77~WjwQ%3G`eD=)UqxIYgh)laND8 zxZubStz6gl*QJJA7veaPSh{8E`9o^$nf^g=dJkg)pVUaPQRt#^nG6#%UG#^aZ1fmP zq#WbYmSuDGnz>)m$U~DTfg$BgE}J~BjCYPowlT6yo{H;yHryYd-u*-cc03ekrug7) zGI4{!(P~p#@U4~)#=4_7V##zXk)FP->3pZy(tflybU_5IO%LmV(8`r6wd$4;)%dCJ zn7f{y1vEDcr6;sp7=QH)^a8QJnpPs9Guo}=IL^ulOTbsaWs#&&NZ2VxPhkkR6c3P~ zZLjPSnCX#)qtljSx!mQ(NGLPLWF)D12JCZ}V`1W#(aehCq55ulhkmvFX(sBU(hZwu z9v7m6f#RI!+)b{BmQm7HjZ1!4%XG}a%I&RUHxt)@#sqJEp0ephTQS^b_v#xGi(p;A za57EI!u{-ENki#yySSBuvfi3r*oGi`g4pOL&L(#iA_sw*PEF{=))n~9<#JQ-qGYdg z8NS|aeo9oda-7%Go5mJONU-@mLFosKV>FAyCoU}6o1mf=q>D3)Yu0t5a(tOKZo$oC znz%}q5XYYsQ8}i zuY$~%n1&{17Jgo5S9Jb{+?c9RF|7wRRx`J$GrUtNBlBSC{B{ulky+Nd%zmLsx7J#V z>U?$IAq4Qf;DP1qYD(3WL{pw=g4-R+A(Gq7|Ncd!n zrRrs%B*59A5!#p7zZ@{T!~9&t>uP|@4w)m3ehuL&*TU8BX6!o*U0wX0?bdi`ZJge$qOb#DvHS8z~YMd z^wyF;Yr)p~v4+;H5&^cNboF{)g2CD0cP7fU#HIqWYG2EqToAgRr=Cs8k)lM63w*uxFfx)BT{AQqRIWu!dmcVqO>DLNiYlHx6iiMJS50* zR^AJpvF@Q%VVn%q%;$#%!?6$j{_>{xSTr0xsTc7$F#UdIHDmMWdC3zO%+mH*2%767bNy{T?QIWUgqB0a)I~Ph7oEjaUbi@9YOGKVW2$tLjln3ywbY+! z^?qi+T6giAbXCDNCH4J*G%-9IopKH3Pfi}Tck4Sh&q_yYY15$K3u@_Qe3y0K=!_@? 
zofPjvn>9u*wzMn5z%_R(jv^&X$T189jDyDj|1Ui^MXvgHTH7 z0$_hq?K@b}*5J*C-VQvUn_lfL3EDaQsOS7zWyUPisdEqtv|*{mEv$gvCc?F0{&^U_W&9}HcuNjGS8bx2KYa8e-=9nGizZP&*^d1K0fBHD zi?+jhH}GS`%{J?OVE(i8H~5q9*P`A(!y@)r%v0!BzX7VdlnjW zRMXQ|`zvBX*1&?m9$-Obxl;-?oKMXhlnwY~Ryz-W)CPrNn|8n-qzZEz8s$m521!e& z$bHfoQ<5dD(<=GtIDdHv(xiV{O;#?#s(n3R4>~(7RWE>_eH#LKSfB1uKEv)(eB{(9 zHn$36!b!G#FRwtlTf9`!0i9}?t*S5H0i6vKyd)EQQO!|3qb?nrygQeb>GO=|nda#N zkObuF4_ZIFBU^133XqH1YYsb(r|YP9ns_#p@@$5OJWcN9(H|kMaR$=(bcpH{CTdg` z7zqnk06n>3s0gHzp@(fH?~%b;-Nh+98ulU~Ihte7$OPS=QlTl`Qvl*}=}l-i`R31% zXsWsQ8B)&@0XCe-0e?1b)mNgk74x^)pZ6B6+q3R)piErWipcYtfZPCkn{{Vy1^n>S z?vAe(7RMaFlo=B{?=sA7Tw96zBletnzT;1rH%c3$$zLp%{)2Q{RC+~2#d>SJyCjGX zFBa?H?S}6{Bj|F>sBL>C(_C@ym?H9i0c+O+j-|O0y{FBOs<-*>3Czk7bM+2VC<~{D zwYtP~5@!UE@kuiT?6v3ADbliIyQ07P!#1WWD=t={S9owgmtQ8u%`f}N&$gLI6lE` z1&>D47`lR<6ccb@0*lIv&S&(W0kx2Ogp4oAN|xw@^`G_N3zY(bgO+p0B8=`X8e3xC z2(stQ!=0NC3|=+rGu0fujmbt_Yr{?t5V;?HNO zeT#=SzM=I`<;H48%@!CLG#Nexn1J`5Y7|v99B4@g11MISDgH#*Oi`Ykk7lv@x5Adb zYx>+YUi*l5=JDcnZ<|R-JQs<$=Zo*nI7H#Hw5I~Pwt`fbK{uD&Djv!e{$iRSs_W61 z{bqa$QT_84?=tHK&z1P}3Nxk%$uY%3m5lC<3)@eiNNHS)n%fp|>hbe$6goCOE`^Jv z!%*`@2+kMh5+RxBnRlU_`?0)UJUCxzL_LeuN7IAtwx8#R%2jYaG5q{R`R2>p`nn@Q zw-a1hJleb;uOj&e)hbXa@~KbM@~;Vv6B6^9J;;LF>ku#R@>ZWLQN_P^li+&O2UlZk zCK5?Lt1!Ey$L1q8KpX?qpkURsaAlZ-dQlQPGW=u>mwarl#$x3duo|nctdbH;k9py{ zYoU;5RtU|#tMSJqk3`BefvD5T;53~sCNG2(-@;UhCwVVyV;hb(Rsbo zqg@~_wo6kjnNEk)+s+mSm#8a-eJKOu<-d$;&q3SI)hNqE@)|K%87yezhY_J&x)1W2 zX-<6))~s~qdlLR*AxCvbZFv2KYnz#r)?CK*)?!zlsSKw!0wPXFz=rV;2g=TJ)(YxH zLr@Utq4&`N?+ZXs_IV1`y+3CF55@`(^XrTF?b0Mhdxeogf0nP%m;O2ZlD}b+e=!pY zU&-apN}|*Yq^q+0>m7qdgjXWA6n*ep)RP3oFP2wSh*u02a*2DC=gn;x@n0vudL^L{W<^_tnb#-qdX=M)}lR zKxw}F$Mda3%k4uW9}YNI-FR7jDB;~7tbh;IG-F6uR&XrWs%PGr`jj_jm>yAc+yj50 z`}K%av;i&QXIEh#Zu*YHR>Ml2Ix*FdAy+j#n*~GA&uX^n9tD4i!|TwYRT}7Ub$4$Z zhX_Sz{vj1;_T#4ZP?BaUkDT{X(jkOO`29U|W~+8Sq9R}8>hu#g4Fa_vJH$4}yF2IU z$TVqp_~|+VU|6woXBqG5ReaOA+R122A5;)lt+Y~5W92X7y6E=X>3kY|C&U^<;8>4f zc4Ht(iZaEt;Wbj0-RLV7m&sR6L+{HWBdrz=RP3QDR5a*SrYD!`;(V-pq)Y^{Y_4>O z<`fSzK>?E2C8m2Ro=`7$yWnM#`fe*9tpfF+fq+yYgr~G-h8a~Ot}1O)=PNJ$K0bb? 
z6Ilmm)>UlUgl=(FwW*Jl`fH(;>~ITJ>}%<4pG${So0vH(`(E%}ME@7C4cJ9rA7CRI zJ#gGn)kbcUVvQo35QB)C5~f_gd6IBMvbdkOo5>raWG<5!@AI85yU@jxsV5;m{VcHy zEWaVQh8N3MX?d1glae-{y|4A37%wL@SbZ$}51{vZr{de)u{VT;UxPT>3PJpGH~PrC zTH+O->C^oZnW~l5VzFywA=r_OUwtxVZKg^B>oVCtyrX|cIRXBnZ8uUJApf3k{2TS0 zhnZ$9-B?OVSgQ0+t=8ur(@*vUNdxK3^deHTx(0r{lN@nhn1MJkb8b*=Qu_1QC$MV% zc!f-+wn^>PLb4MH5jj{bP+ABj&W0mxt_n2ifLZ%D*5+Iyb;Bv7Fev2w}`^I-a(ElpdkCA}wf z*>kPi;v3zy;(Pam-s@z&lp`Er$x44m-2bdWiq_vg%Da9wC5gl>eQ%}-$}v~hme8a6 z_`UYii-ta$zj*3iradpUkufcX^ri`bja@HmI3$r9TLq~%QtQ3gs}(I+XPQqH%Dnos z@cR0h_1J>f&de+cI3ggemy14eUlLIfx-~2H59q`5H)fKf4LEYYroq>H3xQAEypc)u z;J5So=gSW>xP$UIrP%zOTEI73k%zZ4ZTqM`^I09g+m5E|_861;+`+*?gWE0%A&Uw^ z7JgpGugE<=C+xB(@W%;0#mEYznQ?22^=G(|6B;dkry-HcD=n>y#513_**#Q-dd&19K1g zgPQ#rS()Ifi={A~iZ9mJ6vR10niK*y!44S$>QAW#w=$+d3UQ6gEP$63MClMzJVW=~ z#b1fc-%r%c0dPA0n2zgn4fSeyYxVFqwdh0S&`V6sv+}|RY8vzy%t|{GNGSI06b}7D z`cXYOvKo8M;A8-ym1=SY4zy{c&!tu;zXmudwn(2T<0@_M6otfMZOLcO-d)lt(2o8N zdq^V~44_o{(0+gYzi3)Sm$O!p9tWs>RErN8TgjVfchigdPGFjNicprgSICE-+IQ5I z_4(BZzJIzyq3q$k=sBOta5Z+#E7NK@$l@-+3~j%T)uxN zx!fdb&}EThK^3T0Xu{!fDEl+8Tzw4hdVCA{&dIg69?z?lC#JR5!LNlyP1aCTu)w zYD+7=56YKL>eYg3!5?k&`(yI!47tDdPfhg3-sk%l+Cq(25EG&`T(eDSAoLM~Cx)Olq@y!5VUXcYQl z&_K7^nc5xEC>0sN;=f5NO6G;L-f?P?{BeoE!$&EUF%@uw|wnh zGd&M$P1wcU&%96(-?(|$dRPhPz?v2*_i#GkLvFT_94N4P-vRs)9)$8 z0r#2MG7j%6>tr0nU;Pb3&-0DZSmK>wo!q{T_3b?m*rmI~oX2wnwIOf)w-@*h7npdo zpw6hSgyXXx8gEnVDBLA_3TYh9${M99Sk?N>G3LXdhYBgSjyFBRf5VA>kRV`+VuFpq5%>#|MRDT zR|A;wH0Ry>yE{mSh0^oowG~7{UFo}|Hq(sT01e~)xQgWnA2G0c)z@) z^yckyh#_sGxU=6ny!SkAmtk7YhGrQR00aYg4WM5aL6!%2b$g3p`!{v0C|A{knCrD< zdb=IG#Z1z9s_+otZt(u~`&K;0(8i_$xjfqcs&Bb8RjNx0eZ+p3ab&-gfgTrHh8F0O zUvpZOy2%E|(z1I6AS_>Dx%?e@1n&O59&eiEUrYEEhBScIQL#5t2Y8LYFP`H$f(MY? zmp${eF2IEo`#z*%p&>+1snGfmeDk~DeU*ivmI*Lx@NhK~V*m5q2>lL)OhZc_TO`k0 zYS$+<-K|IEkb?|RM^V7h@uH29ya1@#{B16QE1_xYQYU>8@y7pf0e~`&ue>-Tt+Uv& zcklnoeA+S=@Yr>cL_@~HWxd5OkDakkGasF%^nK-7WRT!C{0gOakNurOtP%zm?2C|Tu=%RrOX~Nf0gDV|{ew9@ z{%JcVKgvD!j0b5u z80K)wLzyv!1@?0ykKe-gAEE`Ys`&4^ZlX7r>PE#aY0t3Wn17?Q|8oQR2fO|A*TAbb zOf~_|jzDvt$(yA=u<@1&&FMfr34oz3&X$<>{I>kIBy8Kc>bu5tEw!w$vT+Qc_xP1h z&7FGz*hW+~wQiFO9Y=!4Ierq6E-L89pjg^yRNHybF3Tm}Jje;%iwxw(yMHM+PwNEk zL=r?Dwp}3thVlM>cto3O88=MI7k%eC4k7Q)2$)(UK#}0(jGU)!a0NRe_P7yW-ODdf z0>2%}`gfgy&um%rz|IiOBy*4avolBnle3~7FUH6xEyFRO%b$iRQx94uJeoYx5HCe7 z@ir}<|GG2CKww$w^>0`FUyO@8{Z|8`0H_&_;uN)@d0Izs8}jxg*6)izg4@4LY8BP{ zG7NYHDO-5Q`t_On{{TU4aA~W42wq5NHb*rS$Y;QVu1>tukzsG|JPC<>8=xolXCV%EVxU41?#3K7 zDpO@Ae2Twt4+IDnjqwjt>*L+5Fo!M*>lixVW~66^>`v;;dwnzMsw zXZ##lwC(f@S45w`;XK)5?bTUFbN{*l5M{worQbK;uRHb&jp6TFcGkY-VePmSjA3#-z$7BnkB2s(8bA^SGbquL+m{=znAoo06S|PGD)dZ?81|e z#9uJD2#j#CWhEqw`9(7M7H|t0=qmwu>3F=pP(!>1n(h<-$E$Dea2mBnVct9}29t6b zwwEM?xRR)!tPcjwd%5HQT{ghrR>rXcrO-R~+;9?uPz%jIc;7kzNu_cz&SoWwOFy#v zq-?g8;)Y*6?*;k?3i|Ct$%q&=dB_5sWA#jfsgpA==N44tV_aAWR9a=@7=zo184&*B8MJoGe z{wT;fI_9=N1=oiXa9XAOfU0Wplj$Xq`UBIYR%FCl7fB3_cxkChmZ*^P)?hTf49+LF zJc%3(`FKW!#Pc|=MUK{GUuW@aU!D5=t`q+(lq@(8Ho9cHfB5;(;X3Wmz|5yb7d}V7 zCt~J_N_eisIdIQluj)obN0Zm*!WO_g?gGK4%;_djEKM?O_|j~gtaOml1=+xYcLt2M|;;Udo!(=Fn0ZrAgqVN zer#$vV-}XzMmS8a@~MvlJN$N$&6oJ=*4*>?&zQc9-1HRSq~M!M0ew~bB44=F3mqXB zDEJ~0i%iUnx6bZ*9D@>=cvGRrq3#Fzf#1>h0Tn3qs?zy`Wj)v0T%8vs!Y@>>hRdu0 zAZ7zo`x4a_1*pV5$rh1+tpF4?Z1U$A1&Uy^dRhon^c#>^9G|}L67<>pta0%Q7>`n8 zzeJJ7<4=}%+*Rn805a-_n$g0_ZXU2DmpjjjOXF*mD9Y4W&k@qTLp)RTS@`0Q`QC^F zpak7jNMNk|qH55tl~AEwY)JRrf3|9_jNi8w(C<2;;lT$`xhN0cxBIlJS>jslCIp?6 z#ah&2@Uym?zJwoVHqz3|?H?kqycX$gf7EClD{0CE*OblHNOZ9huOtg{U$+w3k%c_ zD0n5%D5P4xe4OBZfAbOi2k`IxJ_qCJ^gCESDCWEvWKINHTA$Hk_WX3i zaoAm)x1*;0bX0jC2|>vLLG)LosN>E$`M`bX2Uy)<*r-utm|=R|6-jgkzbpU 
zv$8p8K;ovTS+~RYNv`y;=m+Talxu&lqx0*a?(@+fc|?RP{#^Pui@$RGoqR_`m1T*{ zO+kMI*$~!Kje@YM)-D|IxBPnvY7LgJui##5QsLfj*HCN!4`pv17UkP@4GXBCASK-l zp>%f)-Q7roNOwvj(g-3T-60{}4bmVW-7VcQ!~iqjh5qj6x$o~izV~^)W9FD+&_Cvi z^V;X$Yp=ET?{M`bOm3`Dbh7A6&hY*s`Ff>7jYiNz_scGA;MHlWG6-a70!ZpmGTBM{ zqRC?FeEXl!sTBlB$1o2%VAEg|SGGpOm`xi@y38g+N+a8#PB}W9NrP)O74l-5^G>=8+|SO1pVZ)ieC7)I|rWJ`5|C8vOal^L@Pg zV3?SHcKgGVRoRKO;ZO9*-+?G+aHjMPhAX&6CXuF zLc$ond{4k9s3GrbWnF*wZByl_+POcGg+=&hysD(HH!fBnLt=TMYgl_ZoYX=|PYTUu z0w%y|GlvuY@>SiOFTg3p?*cf@#E1MffvKrsil$l;VOJU0Uy~#AtZHaN;y9=kO#vs% zSc*Vlp2Cf~C*OKd6OGR#hH#3U6Gkx|G|F$@=RNmJ5$Y@GC@{|ca6-zDlFYuVme0?2 zQKDT=`HMv+pMDCznqVY}c1p~SQev%`z(jd?o0 zJynve(8%7O!6C4$oS|QPcHBPWWDH06Zcsb@1MjVp;O&)E?Ve_CaP>>plX#vmk?sMw*C=IMp|mvb zZuk2N;;$37CKW{%jrioOk04MY9Gmjyc7Uq(s=${^_#}a=aH0PG$x2|5rk8rBL5{9rWT#f`}%} zj1uubtNK+t>D8;l{R*oYa%+O4fh9KoMYTM|d(jr6>F)5CKm?4D@s*3CeNkrAj^611 z)`=9zK^xKKrzQUG{ih^>8y3+y>Ds8Ji4b9ju&SQHRQ~`|ep9IEQ!tPjk%f>K{-4zn z^BhLlXDd&Uuwa?|cj+mX6J=cN?*_vjKYKa5NsQHWu+WU5POCHlst9+?Dc2^a`A}8s z-|BHphIkKf^6cHNOB~uH>NCjRseX;THt41iM@F{1A53Mzpq@|0kyV{vX(m_8lUcj) z@`A&Bqln0!%lanKDq0KrLKR-Rpq-twMXtHt!=1iLgm6ftLRW(@;!&ibH0C-LV0x{x z9oJju;pt&Lw(hc-grOh5eW( zdn?>-uY$3{Z4(8MaTiMA0Q#cS=!5%CA-!k~gthK-?;E9h8->1~bF+CbvbaGfHt?d| zp(82Qtd@^^P8y&yyiRp>ZKpRaWJ;~4C%#$X5_f%MPUR>3TJ2|&U+1R6dkr%vk8 zGxgq{2qC`BlCcK3_!bS|NW@Gg15+W)2IKp|AEU@y{k*+_Uhsl|#{u8pnnB_v!l($w0l%?K zXZ$N~;TpTy0ng*aM_5fueuyU=r>rHU4@EyvPIcZfX8wyOcU;$x%(fY{P;YOhAVW}R zcds5u_v0&i`ohML!=82QQret9V0BJLDN5yZQ9)^{J20Nv0r*SOL|3-KU`w|sCm7@1 zj*zyS=VYxvu?EwuMw3si*T2Mta2Ay=)c#m$zABFQA209tZ1P&AP$UmFX9NEQ4qct< z(+dxfAfVmV&KIVWL(?9@BvkTm#h(30-0cX`8A|8D$NQ;UD9mHI_z1{AKj@kAd^_@& zkh!>Tkkx6l#RahDLe>lp?=r3Wun`HgZIt#Ee=d~Bs<*=*vPJ#+6J90u+|=7Ii|w3@ zy(wOdr&G_e$jk`Yu?N{!5@kXxXud-2sCuK&Cc4#X_a-Gqg@xiVS&R4IFOK`={1A6A zq9t}sfX}`$8p&)nUfTw}je8xLP%Oj0Sa{gQnQ?!6sfXfd19UMAeZ4?dRe7-oS20F# zkt`)N9;ZQ19Oie?DqF>;{2L|MhvUhY8G^p1GulJ(OrJ!hjHX5%d{!BJ{g0L#OF(Bp zKZBU*4UaV7!w}1AB4&5DzE5wxqT=#=fqsfU*Sp;Qnxd6&K8UKtwiGNLbgx>hARBkX z7hKP=k8mZw$RgSEO^~b=66%S?Vlgd##*wM*c)tl9f~>bU8~%L5{sN#3s3@uYp|~rJ zUtXVX#>gi!Q`$^_=WeckfBsIRSc9(36_UP-45T7IHJmB_2C-F?zhKSc-Tsk1lThrY z_<4c2Juc)~P>{Xg% zJ~$YArkLU%`7sezi6`fslVBsx96>VLwpzOS++Y7c$B&_=s8WO)U?|lG=W`AzUmN)wI8& zvp=3sJ^oN09y5FX8yyTbdvdKf~NNH3#BSr}XQ%emC!ndWF{2MJWSc>lBJm-zH2J z2f{?>S6?Zm2Y`B>OQ&Y$RArKrUKfWc-$%7skG5Yxj2avf#C=2Qj5pD?9Erz(SScD#OGJ7r;CI3M^!C~+7Lmr>zPwE+mQlCLvJ|X3 z%n~E*;E%Zk$-KrkI`sUWoCA!rTmhbdlK-Rp0F)X0S?KM{gbMdBi)|Q3ax{2}}&A&j`zf1Mx2t)xw4!^6mxLUFPya{FqMOeO%y$q^hBf97U zI6d|kymw9Rtxbj!_JFZ5$2Tz7w6#2%#eNHco0!MglXOpoHoWYRJf6dal6HQ1wRo== zdk!)QM3S+uWkDAUjb@2WWk9a_karlcr4{i5H1e>}cOOT8cu`4tbsB5<{)+tYn?D@K z0{dC`FC}0Kv*rvibOCDzKMbJIBDqkx2NJj@_FO3J(cYM zBOq{bwdUEgAkkaJ^l9p@8{d{A;xuR^jp+zCnL2vSVVZnce+_$p%W%+(|dJ3&g>~ zEqMtsB*E5rUQX}m0|KJoF=>s?5^lOR(F-rTBulVgDGPgVk=#AKv2lS;%@`HGHpU)i zqTTF;5Yw+Ow6$Lq4cX1*`8db}WH1KpM4rQv)4=kJq%xg4>|F6)kz;A-Mu~RWSa-vj zuWN&XQB2-EUgt>~Ihrc7Bmtj~2J0DqzlBGHQENm8aMl%TSD&o!`JHzFf*c4pqF<*$ zjCYJ0y-qup%ta{ldCWt`xKMA`mmL&JS=6fN7OA5=-b*_cUatrt7ji}mYVk=3`1NLW zx8IOva9LTZvNr9N-Nn%YLvP!r^@_3Yf97Bvx{r^*Eir7>QmX?KeS2THUkGHGsbg5M z#7vw6{;$#gu$xzEsrOJwSD|VMIayW118}Mz`O1dsPcj$i876WQ^6fTsDM6zyNXwQTpe&UPMgB~ZLZ^l9{G8T<0Yr>2K{ZnMo6EQ7vp(0jnlArG>m zNiQf!EXV|#(7ulu;s)5h{pt#_JaQXrcZ7lNeD7=f9YJ`v4R4=4Mq@+yA@LnkP%TGQ z@*dS(Hk2NOjKv-PsPGcMma&(@ZJ(+u8On@E96myw4aNe#MylyjF>)S0Bu21lPl!9m9fZ zj$3@7x%RD1Y$;E%_7rdY=p|OQ3KZyY=nQcP+4ExFqN+7Ye9^0Y$h}hYC0<1je#0FE z*7s$9Dof1p#6=wqhzu^`AC_gem~;x1l1WR&)#jrSWEn#70aEaN2dp9)Y9U!=21yX9 zTz6UF<9J%9p^G98)2Uv_rTOSnGtNLQeBBL9(v`{?7?5?vKa%+<4w7Qj&%;#9Q^4At 
zxdMf7%r#Cz?rXy7pC^|LrCv?}1s$yd#D~f%TIlvl3yVx~b1f(ob)_e6L0;U!HqMn~ zE^FJJupmccI5>&>eG31(^00f^izc5NrYGJ{PY+~l=icIsUY@Cltrmjh!e;p$Eiv6; z<~lFUCKAoq!7U;G$(Q1d?8tFDKfR*tYU8%}s-N`oT{PylEvb{!@>*A@Y97#A8UzL! z5NG$l&Ly+t6#|gCauIpgPtLKIu@6}6Z3rbRA%qU8NM)!n? z#_>*hPuV~sfBH&4QBnTx!$;%|{_GST_o5e9$fFrTT^h2R_>1D`?Nw=Ve5PEXZU z8{6yvlLl{B4KW-&Q(nDmS!xqdafcgRwdGB%Mujk$hWEoCabNO80Ipm_p)q%+I$OJ8Es<>Ja%>b`O?= zNdeDUV;lzi_x@h%*Uvm5km($u7vzK=5qn)TF0Z9?GGwVm)h8uee@$v$GTAL)cQsb| zOL*5iORV>z3Jku*}4W@|72t5-B0Iv-F34p)1Pqvma$4+IL?Cpb9#c2jKB0Wc3< zS9fi{+#bnLpCRz6t&9LQo?+{_91h|Iu}YjZFK8X_K%uCO20LHkrSI7Vol-h)zAwuQ z6c72DvF~$LI)d$QYK*aHqwWnMqTSVi=VZsy5{d65p0Kxja?f<|bNmlR0hGSnS3HX= zjS(vsu%TP&W$+m+_9{l-sG-f#Hyn2UvMh6?G*eoc{0D~o3ouF?_Bt0Hu3YOf1~U`) z74zNd{(fXuR+bFc)<`<>H>GrE6Z!9QF#tMQgZy`E_WXurxjTnq!IG|O#;94t*jBnk zqOx9((p7G5uG}-)4bWEnDFAI`BOpWg?H6#10LaDMz!abu6Ku-iq?`)BT!&w|9v%R> zD$)(s=zW|amu(e+_A1sp?=Vj`uF3Vj*>epPaxL?gw<|~?yg;j3>&3xxEpmnsl%VH% z!m7|LD!>$TKW%$|PK$UKRADhg23TgJ`nC$59?ZEsMti1WrJgTCO&*^0))&m--HRdj zmNx&pLb%jURCBZ)@VH2c<20fG&BJf>Da#lf{vRZQzkojQ5;_3>l}u2Ns}UhIVs?d8 zeD$6W<#}sAlQ~{t)5;ZILjWzVPSab$cZ!*%;reaYRp<{8#FsqTnj6ov-_LrZo_D2a z+Gx6c8Q0DcN`_kO!!&JsECMLMWaYhw@I(~q`?is@|Xw{?kx$b#JOf?pNIi zuZ6Qc(1KVUt7iDpKvTIIiMq{f+)r^%WI_~^cWpmahs*AFO^Hp-85T$5wXjIOZPFH9 zK3@YzKjg3Dig120S`8pKCEX*Skebj;*v$+BO*Ud)>lQ$q+XubAo141D zn(n(ItgM9phW0^X_NNx=4L@bDk-(d+X3PHp`Q}^wd~T_L5~{&)5 zD8-R;-=!19LdgNC;)hxdy2`smTmmkTMmoznatxV4>?~8|@ag-jc9MT_0pth8;t1q( zvTt4xM*)2aQ5I}9F#)rsx&v|Wiq}|tQKyY|IbC2*9z#56>>D6hv{hOmLcOf8nX$k2 zdNKXcGF^y2+v=S5DT!^3$JS5XD3s@&vEtGiolfvL0D zTTKi)l!a4NPJIa@K6#@ z)w+b;&ABip)vHK?6F_4d5*7M@?l|&XQojUWVW_igNg&tiyg=r{Fh^!1_9g+Rw4Kqg zI9j&YO12c*iA-*Jy<;abf^}*eSFl2_n$F~%z}ZP#SxbUn>0%{oGJJ3UeQsj^DATi5 zl67fjIR;yKUAB#5f)wbMKWHEu&S11U|J89uTFUFq1bZAUhC6Q# z&K-P~2#(uvWULGV;&^2cgAL8!445aXB%lu6uW%2xeyx{wD3gc@PT$=u-uwSC% zxGA8n!z=q!#B6x6|m5Q2j>9$@aCLZ)dHn)Wh5 zE0ZRD0(8P7i-p;+;bTruL zzt5ZO7WV>3OIQPwqr@ZUE%u0awv$DD+UWjZJCK}@{^I&i2JNyR{-41B^DgHjS8iWu zv1@T%1hyVJSxBq@vHG^$B%^F60kNcmQRQxzmtLHcLWc0b(Y1k)(te%MH$Q3j`@h(; zF$2~w>h<5z?e1cuJYI_s9lrI|nPi6hT9C|F88?s&UM99^IjwE=L_aXK4vX8Gak`8f zWk?ouFHvQZ?Vo z&}~Yn@Y=Gf$x@3DOB!Wz95z2!pBDd%HUF!S(GTv(x4~yS3RQjMcFzyZ@*9>aD_-7& zTpTSCZ}fhO)NmO1LQ`omN|spP_k0+j)kxd?Wy%D4VE+XjE5tuB8AT)R%yuI^`wVcz zR^i=&?jqp-Xq03230Bx_Gam-G9gImXsuBHuDdo7biY8$}JN)%$;kSQrPo&KG*LY<-<#XFja z#s0NiP7}@D@%+j6*lhAp+I&EE3k+ryAJm|uT01$|<%lBYARF@2jV{jA4y->Gc|d`B z-UA)5K1>I28fQT~?wCsRr5z9g@`S-GUsW5Vj{QH{$d~!TqjJ+3|b-bSpM>^JNfo3zn!tG za4ke81h7NvtXLC|74p7(y*-3`b?YS%vwN*L!&hY!UWG5*gE?S$tgK81?%7c_tB^XM z!lI|oU|Lk*4L7!tuw<3{%3L&Z>$X(RzgC=#d~w~4-1Y@VBItvgI5gTuQeZ@YH3 z4q4Whem7omeREKghJ8aMPa&CI?)KK^)@TQSQ&QBi8a@%d{DAlb)pADnJcFv8d$=AV zsF*BFC>e=gSrIyvWVf?~(YIUt)MQ%`{z>i1U+><6ZYdNmcae*DsZe!v&t{`FA}J zx#>R&a?fo{ohCmvv96$6r}uTIVqc?lR~nt1MgH{+{KGM{6Mg8n?aUXYPA?U4(B|r# znyR|PXKMEoJ`c~CmKcUFZ*DV~@&Wv2h{E#nxs@f8tvH49or^|Fhfhk@kDB|NrbkN4nt|8DS zh?~swNy?x*r<2T}oypoh0%35PVxE`964y zoUpzfNa-oMq`uD%yUo*yS@1GUBMhx}DLQs(R37Sdn8nPii@3bFyKza7QiZ;j%t0Sq ze6w*lDS~k4vP$l1zcMv9ahS2)ubfuobGTPau!Uc!^hwFU>o%sC)Egp|=t5Gv(NDHq zQ~u_5HFcGtbL6*C^pJOF-Q#fjN$EnJb1}Wp)xeW! 
z!9x@m?-x2oW;UOyw-CHrN{%y6y!v;r=$|m6{^!2qsfRh+=D_gv&bk4f!2XxrL*UR1 z?<4om0yIdm&{tOn!02NDXe3;>TUiN?Rs~#Ve}eje>iVI~|eJVm9-2~o=0F+YR)lmOSy$D-I2}Ja3s-_<}siqCy9&0@P zj3KvGh6b*C9ltSd6mzGr5}J_hzJv#b~x=Wex$}ZvdYx zCWcq$_KmGnw__iN^FBBd0>a4brsXn%k>}vJEae={;E(9iMOFErQ?r_G$FU^xRNACR zI;=B{mL-?(H9axNP&_<5zHRx>lr8OSnqaoT@k<;-P?8GN zFyg;vd)rqxf46{Bm+*Sa5$?J6;|cE6ICWL@Z`9sXK+}zEc;BM=Q3#c?8lAkgqt1D1 zkbrZRm`*K!6_#+T{q10389KOK;?rh}kAQ#flIN}izDd1Pgl(P6E}*n@-Fd>-grbu0 z0$wSTmsHmOb*tdjiCVeKzC2to1DNZv{A!KP=C<=SyTgFB?J)w-M!A0M;`tCx0^vl6 zopE=SUK`K34x)D1O+=)P&WA`RXg|Y=?Z1X2Ic1Ul!uU7RU>Fq{0Gts(A>3i#^E_|R zMDC?lB*Nk4Hh!8QR*RvUWObFF)Xc2&R|Z&JH`ivNZ0!2?AuCxPD~14@NjCma+G54W zPy*Eu^oF;nNq|#ELaCNBBUYox&=+}qtV_C3tCij-#Ix((*W<9^j@~mYh8)J{h*=zS z??oA|hh0rz|M@){c0%biek}-MBtbPn8(MNXBzsXu;hzUIVE_qE)GsWv3a?FTHy@3p zFhTk^C#!dSs2Vt&Sa+Y&S$i>CN5gXS3~^MmwLB`oYWWK0POW(aX+r173hA6LGUK5V z;hy8>GWbQAfH8OHs;J*=t4n=xxo%{^$<7>Z_=-FUL(BI*nx=CW#7d>Zrb551;Y9pE zN`(FlV1mJxq=WSIrKQ~~+7=_pmLC9eNa33Z%1J>_pSM|;;B(k7DBW?0^899NUBR=c z-q1H&mI?NbmX0BxYxRREqfXw*#ZJZoHF+JM(X&|0fu%Sq)gKaUDa!W^d8Mauy*U7k zsv0d19#wS;Ng2XI=(or2nyYXr5_gIi!O(XjXENnQHJZAa*pQCF;&(>VDrbcjqf`}Z z)DXb$-lMYG*NkYpwS3!Fv>{`KfQ3({Zd_=yXJkEJ5CuSAv{hQM*l5Y+bv)wA2SL$D z1)e{7jvF>kmeaLz=@>q82lhR~F_Vtz8mAf3FrpDlr45mPw9{8I_jSt4+|FqF{B1dy z$9`GE>U*$yaJwV{16`RSM9BY?B(7Jv_pT9DW5o+{82)ZJwYMK^_MxQYE(Aj0zc_U7 z{rwtz7Y{@%E=IJ06ulhhiV`j>G=GG(_y49=jEMo$C#3#)+(l+q)V z()n9YVXgb50E^im(?LrCE5!{U1*7u#b%2Zz$#jGY8m$G05Ma8tbfnbu|x%>_Ku2EW$NoU&?(%jCfE zVzn8z@w26Yt*46eKPi6trDhd1Wq-H~p$3fhG6v3JV^tl+AS2*1+;Bl;j z0{tG7dXh{iJRxVyfm5WOKYpTDX-o4V^hpUf8X_k~1C7%6Xt3uAW@lH3S=TYK3T(pp zBoU{iNZsZ>&Qjek=x(YOR6&R*2#CS*tCGO7Y?j4cO@8-x;jApSlRAt>*4LDAq9i-H zIki2#FtZSJ{J3Ag=5D|ZaB^4M8C{6_V-4&|7QM^wd$6uVENAGew(@@V9m` z&YvjL?ce(G0c=URW3P?g7xBz`M1Bb3pM?$hHQ|vKEUbQu5_f`diz%MY@?{>Qp67mb z2SdLcs(qWCeR;huUPB>7S-nh>&ph66Mmqa5gQ|cGb*=}#vzEKFw4|Oo)bUy5cYnt4 z2O~cNh8*RfbopnQe7FxPEIx2~H8=@Z7*!qeV`?>GuOu=B3P}A#t2MfAI zA>$T1*K8M&%}$l-vRImTiw!O*Qqx;~U|!6Alc`9+`N6}}#~gJzmViQhfU`*_(Gs2532ML)JA&}%!>`KKOe=hW{y z{dp#!IA-E~Z^G8*J#4?t&AgYzDDBd3H=H{s_r9-e1BZuwMoD_gR*hp-@4I~|6VIA$ z4F`}qf&-z)p%|ng5@e*p>?nLL+s+NJwrJmTSooE44f!%Sd74A}2;Rf}K ztr!@gX9|W9Miy9JSy39Asb*`?9J=`2Z}ddsUqfb$*no}!qfr|P2Dz|C@>77T6U}Lb zb_LYQ4{DgoP$nDNzjZRNR93zPRa&d7RPCDro%fvZVmFd2z?K~0Kbw`ts0dikJ_bhv zo$5rQ%YAcshDyyQ*Ih%@3P7jVCH;ySC6h}reCCYuv0I0As3FEc&l%I?E6!+J>#y(mqO0csz^OnWfPlIUj{+#ERF0mH z*!|;mo2R|drrlDN;+tsbQ{$a8zlsN_8k_O^ZCwz0UR`z0Rc~w5lmH$1&$4Ryh?r{$ zDC!aclg%*~rF7WTtSB{k-}TNJLeS2t_3BA@-7g5B7YE<=DrThBeb2XlO3&^%?U_y6 z>T1eO0VRI7rxLZ${$l19UAAFzm3h3eVz;>3MT3L~3?q!7taJ^30ThCY{!HSMWNymB zVp|eixTEg^U-o^B_Topui&Tx8Z>nZu8qX!@w3&?p%dd<{_&o{uBp2D~2>{Vt1e@Q8 zeqkS5@r_%7u=n<2ql>iU$GiQ}_DQ-g^9}G0zgTrkscI}~7ptL>O#!_$?BREI8L$Fh zh~jxdp67VPQ~a*jn&WA3I-FF4cV{TfTlo#!>;5cOwDT?pn)a|C{=M+_^JzLZAQJ)_YcNo!^ah9Xx*f@iG)J+$}5TCvyaRD zkO04ev7Yqpm;Z6!1Cv0krhBaV%}$yI|Cl@Yci>F$Z$SC6elY3Pp}kVh_}C6#n1MI+ z4&ffEqHRCN(MA57=(afhmlgs1g+}&iW*JQ8)NhbynZ^CzjWGX|FGz?CUrR++Rm3&e z=_s?QR3PW{FFIoPv0kzCYE#`%M|SXh8-Ni4tnPhJ`o5?pTV;7FXRBw`DIo8wM`g|i;tpcoU( z`FsCW?C@SsKiT)-5cUwZjTf=s^LBzgGBxiM@*Nao*@M;XWH0DtPZ?;+dK{;H@l6!_ zm`USNWLWIO%Q{Aj0vCAc%DNrvwkEzRN_mht#1Syp`Emi{4sFk;Ap1P5-4-aYd?D6O;^3YOsO z$+w%T_wV7T2!{V>S55*ckSkT&_`a@o@A=j5qa?6B-FN|vf{5#65>5}$@b0xOGB*A^ z*e&A!CG4Vt)+xF8XsMBt&0r=i>0c*%K3Lw-KK^6y_yF7*iv$3gsVHAzoX(l7z&83X z3GsVe!l75j4!E(e3a%rwy1Y9mn5I@P=GI;t<}^t9dK@oz86S6E%SWQkJo1GCUERiB zSm`ZDb=oX_#-!#?@jb_BGA6beyEhw6ksez|(IBtXyp z#R_e!iRs>>wu2{u_~k7l26>f-0u1r zTJ!29;>9bd`oRju_{xrhzJCKu!UO#NbWif*2+H_Q?mx`Hdhm9pM29f2Xvg^ffi6m= z|JvFTsF7`sYN9N?x%eQs8XqMb3*%{xSoK`q8_ 
zSVz@9Qiy%mTB#NJ@iE-AyM3Elaqkg%8n2()5=NL~es^Ebupip$a(Xj;dR{)k$V=Hc zT2<^P+NJ)0A#`5{?A_Q0Y`&>dJ5_jzHdYdsFCMKwSQ0W*o%ssz+zvZ6d5OdQrHS)3 z^&lSOJ)YOpkrlJkpzLsUZH)NkT2thDUiy>4P7D5%-K*IZ4=3i!Dy5_ky<@ z03~H-VpKifVXB_x7Ac6ek0R$~c6y$(VlsaMcbm;jnd#?5QIzp!2ZtKVDTSqYz8@ZQ z75<0g7eQ0opy0YM571C_^o zRhKpvRub+}KzY<%_G!Y{QSL<~ig-;gTegXmlNN>W;Fr1V`{kDR;?Kf84rlWZL?uQ+ zXth>X=y%79b1o;R?5Kr9@eD!PfiRg&;dE;-XdSme5xwuiibF%-smY;no?HS*4nBX< zjB?pB>9#u_Ir|>%IWB#cfydGJJjisZHuz(%#Iq$Yb2GTxTH0olsYwj?_2xk`$H|_> zDnH>u1%*F=UiUEu*iVrxdAqB}N*gr7PH%`a!A)P_=>~9vTL0YpaX+hetz%VvPk#w` zj{jKujuD<9CX!#WpaV&CI$zG^ta(JDc+zoRJuICY8PnrI`01fj{SRR*?8#q01xJ_S zd<0TtN}Ga%%3s(ef2Gs^ym>zO4OHKdV zOS^}PHWH6{x-9jk%M}N%1GF-RmIOT5WV#*wVfk&gvkeFmuDDB=IeTdm_gI|?vPrMj z@vy$@CIY&wtWarXL7!_G$v&Hf>=z-(A*;VS#VyC=y0w-xn}ns!Dzxjv5){Om&oUFF zf1NzalVchy_$di$kk!beiS$!7iYCO509v+`jIQ6Y9i!4&r6fzLnL!m;s@XVtZpU{v zuLSu$v|ad3#;;}0jZxbHsW=1M;+CS`qy%HtuM3RD2BK6Ka#zFbIT1TbrmNOdpOd|Oi}Aq2{LQ|-)!_Y+PCY_hck zy9*Q{aRBmr=6ajyPYE=}F!Uv?E*O<68mGbH1wGoG_~tSA9KKwQ@7D&K*N+Iub&BhKkbmXL zn&H*xYSSXtH-bmHVygWoyRDPiE!0#qI@DhTJ@{ z7GuP!4+~IQc?D$gM&1f#jXpQeeV(GJ`QBVqpUD|l+aH$86~5%>#xWoK>i&FxMu-l` z0LP7NZa*Kl-4&O~AtX~-poiuR0N$)JdxJD!mdan}|)VsI)J`p-l1~3EUuqMbs2mpr983=TvN0ae|{EE$`G<~aX zwU~$H1=$UMJMcx9(}Br0ZZZdCJCy#ae%M~v`%=MatowAOWMe%s=J=r$NMr6KSzOY z5^{cdGWOb={tO>eH>PUUM4!Z`rFRInJ5`S_)qmj5r?}2gZ@kp>!Y{7C3StAhyBRAU z=3yt3O8}gq^|d@9P~0-3=0K`Y46~tv(l_t=Ti&1luxXix`%r4Z`2TfqU!nx6~m1kft>DjnB$Df5vD4}d5&K9T%JK)_&q{wK{Hh04~YmL4{ zNSit3yeF?T-#J)q)QPPyFKj{p<|TLGv=6*CrmBMLZk=a_#RxUKn@#_u5IQfJeOd*~ zpn7d|ii!h2aCtlW6}Q~5y>l3Bzxv~5rP5)=&sawH8zWu9Yb z+1uFg&VH5zrE6;8q;@bJwb9vXU-4uAdQuibab*f`2Iqcm3B#UyVQ(>O?Curd@~O9a z7@O2H_8GlJxm9gOIjKr)>uIh}FpI+0?W!nAfl^vd!jd9u(VNoZNmP?U*!4!t+xL}~ z7$*xisI!&#uMSq4jpQ9tIqeXSpBcW^smKyd$DVm6o;>f!WZK(lMUmpagxKi(%x(x& zuhHtgt^6@G*lgmdrWCJ}>;5t>bFThpf%={EZrIq=Qei%-YQA*>wVPUfX3kfsY`LlO zFTWw%OU)%J$3HNfo-`doX3z3f1 zk-x$l2tt)fLXykm*Pnp_}uAx_J;FC$#uGXnxY-YTz1gnW}>p_S&3Af>B!q9gO(@&Y?9!#)a zC#p8UuYB|Qkw~Cj?;AL(%!iAw^v_jnA$w~~)dS(;1cmdnwbKF)qMmuC6+i-g?bmYs9%wN*q&g28HyiMmb~Y@z4^9_I%B5y}e*3 zxcbUHst>&fZ4JMe;NVKeCf{@UXre~Y-*j~ltt@=QJIBlO?I3T2i_?&*iwal#JEf57 z)yfb)3#P5{_ot6XM0SCb*c-z8RB5e%yqsv*>UD!n+3Av3sh-654VX0Nmx}mKm+yGR zp8{Ze(iAn`znvLDFU!?8)+@{!y9Ei8A!CqzSov+`Rj^RCIrw9c9v?`g+Kn0t-MnLK zf4N05*xK1%tjj0aaBZWQ*p|HE+P%_gy05XcieLP|Uv$cJp+Q*w@xT9xrrlJMIy=ed7NNb6fq{Gl62nC9OLIZAQ)&{Uo);eEVMb z*)Oy{0ty6d;N73}*sIyS6R>lByScAb8Y8@dzDl1_a(&Sru|2`vNB8jhv!1fhRgi03 zSBLj`czvO0z^H_frRK||J4aAeNceX{QUNy+z#%GD^DTs(z3pS=fXnwpQ)-SITG_PM zXtWdWu0u}74xp46$S44|ZvBUXVv<$fN zBKX8r**2&jG*w_=(|TG%!6?Qy+s_ZZrEKpfdPi~Oez^E)jA|)fDJxxFvgTt9LQCNNxS-RBG#@sPzuyd2&?T4~21 zA9dor8^Y4nG_ik{L5dQq$X4B;XDH|o1mYOuu^&CWPIq|&;<^Mx0`5lvb#{j*;r)oO zpXOt={)%#mbNKu-JfwI3!wJgA5mob0&b(JCkOfFwVCCLM^rBYdi#oHR-cokr0<}5> zQcoi<$kmE}m&PBYgRjH(ecGZy`#*LJlj{cG(;>uMpxXvl+#TOq9%)tEHEn z7O&65V~G(*#X=KGRM*VN5x zlE$jEV_g8O2}T2C+ELO_FuaW+anW6Ip=)SvCJ!R43_>tK{E60=^v^PLd<@bGHa%^U z(RDaI^Z0R6bWP!~o~YtOu-X-n4@Rr-_1P6o&mB<(BH|>%UI5LWh^;>3NE&~n5&HcH zAo<={KTeEiISUTY2B-~%8oTlUMOHl}E-JaswNKiMv#X~)yzk7Mfz?JT%crbJy`H;3 zAvFpZLfD$HJ4nK@2-NJWY`rfnM$_Wll31-;IPp_VvBx=&QC#K};EyEAb&+Fn2VBx# zoox-AYZ*cNgu{B0IV?_x0*xGRs{9yzP9^yOITF$fBJK`pa{r{fMmoB2c4<8Z0Dm2LrvZ-m-@6<_@AEyBAsYgLfi~#!YIGc5{03GwSN}{Yivi44L)l} zYofDzjJxlH1LmixW+)Xz+2^J{;?euUWhgrt0R{=iB+?!wA%U2OC+0gr(IpAG`0y*O2KUo7|kjOz7^3K{g7GPTO71w^E zfwawza(ttBV(z@XxaRV=2znG!G}8a_)c%}!;6*7CT-Yz{lrqB;dJujGn`x^?B54+| zh+-+%9dcC!e%^IS=sU@u(+EfjFe+r>KAxk1GoGz|J0WHJG|K7|vc{nR^$Rh9SNIoB zOj`Y~e;sYFW8Dpl*VVKgxU>>uUv$WDhFSV#SJN@G2#?Z@-%>v0Ng{|vva21HzK&Pq zMT@37%KAOE$b{on58!wD10BahtqVY?(gp<99fvxgSl 
zB)AmE-va{-*W;9hWfQ7&EeC-u6oz}+ytH@Z6l9hq>*2^1)NiDOOM2) zk@JC_{eIu|->7OMHwdpn1NM>1Fdn{8Z0-l0c9J#RwhO4q9j4TUCRRHT`JUe$*}I&q zigMa75-6r}OO_k9{%Ulw_&t<(ytEWRlb`fx*pW`E4^{Th;vfGY(p{{=Xoyeekg!0H zQ4pOHj`98cRqRVe0|v8pg!OHD#NTGulnMWFtm{ty|Bv+sBKlL=rt%cx2nKm{qsz8+0v(zYGc&X0Y{i;DD?Tbm3~;Plr|WZnZ>yEe>Uu3I z!RoSnqgf{U{aBuRB98!k(ZS-t7q zpD9;ua5U-e>DgW4p{9OJB^4zz0X=y)|JX9bOychonWu+Xrk>_>abcp{9$`*zav_4z z-G`)G6#%C}?nUTB_+PK5KY@e-X;;K}(lxE!N6QbcSK>_i_~v85!f|n2@hm{o87tOg zXjQLzJMaPMW2|*yfPa2ItcsVLK(n9|8gPF&o`7JiusN(x0Vv>n&6iRNJ=>XI6->RA z&8!k{jzcEc;!qn2ZXsGFmG1-KxXZI7fARw;hfuu55DEaI8NdkAS2n241a_x zI(3Bx)zLc_iae-Rq^Y9++gp?z+5wJ%1!{M^7JPpx!9D*^WRZEqV$}Arw3LYqtmT_n z`kqn00TbydT0d}A@*b{=Z0kVX->ZUof(q%`5+Z(UBYa=@x*!_8;`7%^!oTb1fByX3 zh$2yz9D~&7_w^IAY7O;QWJwF1NBh&I;)%=#7kili>!iE4_s9G^e^=eG8od%nTK)y~ z!k@Fl6Z==psp;+ULo3!S4D{@#V%v4ZgDH|lA)8MaMc|VQ*33sQWRHbhjqO4 zr_9n$gEHeVHi9;~2oQin3WSRIW(gC}iumRUs{yfSQhG0!5;2XO7wq`okGU`K5C4io zu@8YseOr1oK(*m~#_oNS!Nlc>^W|{GigjeEY7btKX^F;?8Dwv2XS!4`#4hze4~m^P zFibOBr-fOq4iuwFb{+FV3g+|VLN6aCn_78!<}#(c+~3EF(2EQlU=VO_cm$DH%PAH}7Q)U6%TDl|aa*xWo^vobXFE-T~xaJ}bn4JNwtwQC*W z&$)i1%wZD_q3l>eb=3Z?=)M!vkYm@W_96P+Q|%#iv~~|-d_-cze;p9WA@=8>O_tf+ zU3fng1}yZ5tG^W(Mxb6!o$$=Kc#`nHT`5iknN`&ib2T4J&|2|->OywbU%`p_pd}X zal_}pi{DN7^NT;NV*&c*WAk7xmNOf+E^GAmcR4y?e}C~hPB>7AqrNz6B;RUVh*-!4 zUL{u}#NB^kAPJiEtz>SZY0PoK!tQR>Mkcm63p57LS)1~9XBHDl z^FW2I2B1f`26P#axRIMqdR3g6u8_jMF+L73s?4_dsVdYP(Jyy86Sa3Hyiv+tp9e9j zPc6@r1`=|KhGi=07#bcj$Fvmwy#vXI?s(U4T0DQ;?+D(!4el*(tT zvU!$HRNAu;a=rSjtTg8;P5y`@7Ae>P`9~!9Xbpqc+MI0iQi8#|@}WcG%Vxs1Pc$g= z3k#>(wmQVt#D9ybRRhqfTu!mhk2hN{@~w#lZ3n;jW`onM{-BAHP%TKh6HkJwuC8AF_LZ&q z2Wm_KfFcMQ+sYeL9@2#A#r2Jo_xU@c_82Uo3GIeTWI zgy7Xu9HalVcP=XECXw`C_fDNtc+rwHznai5^{tbQCDc_uKwa{T~!ohG|{*UyMXCEz%WzAC-8nq@%n2nS8-6z z%hCq;ncaTkR?i;)4XmdWzq{da5x;eMF5ks0~`Zf+MB%V3yrE8iz zMqT)z+->8=9$*$=B@i&%off|NaWJ|`^$RCBHI1Ki#9R?XzrA;loaN_#{^xr>JphYh zy&)sKOp1Z5Wq#5Mdd|(jCi1?zmQdOC4WROE04mSJ!d6Eo$GX0DOrTLIr5B1SV2h^B zWvMxV<1p@iwA=fUNQ`3s$wh~T0Vei@PiAR94ql{QU58143|D!SQ+)ZR`_Al5fvIYo z{mjwAHy|^&GF{D|y|4D58A)U^99Toq8F1&)n%ilB>VI9x$d6XpQw}X3utT#i-0lgq zIMGA>3TrpNk;=qyiAAv+e%=!s(>wnzg6LikblSydRf{}KY*X%p$oH5HEC(EfeoT}< zo&UZtcnrm;kQ=qWbVOel7*-{~x-|iJTk$a7RH@7%b)wc?SUFqOV`aqzL?s>SP+l9j zd9`4PGfHsKJX=E__W5zVz3t&Y!mRVonI$C$Z3@Nj!j!DM}+y*Xj^3|dxB|g_(t0vbL=UunHJyB;U zIVNV44kLJOV2^Sa>Z_0<6*c%$cMv)Do)<{iFf5N`tKBME?s?9hD$O7e7yG4M!oVkn z!>S~t%i51Pl&^QvDCcE4vc5PZEaD#1*?l-T<3z2?3h~mahZmi?Wff%%`a{h_w{w%l zDb-8NUzXN59Wunm`OKHjxP@L%r`u3T#|p?3>V@)&LVF{HuxIj4v>tZbe_}yE-RQM& z=-SkXm;LJm3VsEjJ5_<)8D#kW88Qv>2yydHa10G5(Yyv6{0nFxWMdz0o`AO@;!kFuJho5vuNf0|xH{ImdW1Aw6F4^1vrS8)Guebk!}s zv`|{Tm_f(aI%fLGBg~4u9BFZoYgc%lQnt}5(;@>im)yEO__2{tk)q8A&1u`ACi3glV8^uYLv=oY!d4Jk&S2Zbr}H zqx{^aO@D_F&S#LjPuk~Gn9Pb>hxpG!Wp`aZru`Xc30}3+IX~KDKOX^mT?CYDW92bN zGe;i%g1mgVuE*-KVjV`1t9QE#9bM#p2uT;}lmo{3jV;Tq(F(g5y`Fz3m zE?c@30%UQ153WZ`eI6v|rIdPhNvBLd0Uv1U%;S9{>IBJmM!YRB z{UTEEqiD?Eu@UxA#N#-1SKqC-)uFB#UCp*jYK;4Hk?5_&+r-rwUV)08QuO6uHeDlYc%Zu0B(}JjE+;G4M^ONCNadBg|+Re5IVW>QI7pVAb zz4u+xd@>(XB8a%TylDA*vEGB8HLlEL3w8d%9Bn*D+hxf@p=hIIHvK8qemm-UOb9T2 zB&S_M+?XjPqYHut8@j6jC^t(5edh24&9L+HV?fM3TuU=u3XyJDN<1-c4&~Yxa6Ut= z!FDMyU$u2)k%!1z=C*}%#f%9AHmitpH2S?I9<{%53z3)J=h=)O_Uy{{NXFi>QQnJ- zeJ%5u5>3lFXFz~4&COITpacl1Cpcgyn*%4uXqqhf8XNQlR`WJ%@VmVXA~WwR9Z1_#`KMO}nQ9bOebG`di3YCibI7AOb(`v? 
z1#VML8#oeY)z@4e+*bRo>Bk4r zCLkVG>hr9yTTf8)UphIOI6C>dl14iMawpeS$8>MEM*XUtkLUJMIx*DiJ|=@0lARQ= zXG|4{S8c1WXN5{Erh+v2?Iray`M9%Qn^lmt%pb>bnUA*gqL5V+P&xdg(yTRA(9Gx^z8gj z0W711Pc?LA#%=~p#SLtn8&7|LN5uqbOq-u5K(G<~#S>Qm1R*%q>OpZvoZ9Y`L}IvETvD$6rWog=Rrj3~&%8FcSo!tm(n}0J{6-q452=++;6DGaRs9 z86BN*3z0_hJEH-ML0^QsL{g1o#!x(oqm40a+P0=$Os=Zr#9v$hGBU_Q>vNr|UWZt| zl=0}MQ402uz_&!+C7a_E&Tr*eE>D&5Kl(h(Vvh{8A-k1U?+GhwEGjO_fWRoSIB~q~ zTR^Ix!X~a*YdKwoWSo}BYxBe~!9U>RBO?AvFRSi=-ebUm@fb*`?QFhwp26+t9nzyC@m)azNd$98b{2%Y#~mnba4< zEGEV%$gOdOPROD4S0ImbyGFYPi{rw07NGEwEfUA^Q*(2xLFmF4Z93Ngw9;J();)2A zbfD6WlXtu!Tc7hNzsOb7Qn+_+vj9m~%kmKr;}8;qb|a(KtqqxB z@bPep?V2*Xckgp8NDRvue9lsr@PmQ^aC&P7!i;yc$yv;%Qn}zfjVNk~skWcbeI2yimw~`wHpWbfDm^%$rZU z$yfP+MQVLa>`y87k(yuzl!>hexf%>&{8$6?S0h6mD|fA+9`cjk~;Va!1_}#h*_w9WII_QbsqZXvs=Onp&PQV zqZ^J|UdaWYNn9rX+IZgGUa#v<>6{g6(G9=aITzrV{%*2Ya8@%a%H#6!YBHzYthP3QU>iW5)}-2Q>q$x=pObkBmy_ zqylnb4D?J8WR#8eP+~@%B%9V zv9Ts)f5Fyx2sHGG`%msQD)l9a$tM{gePHwh#l=S|mU(h&>bu?%(7!0lSJPRsQ7l zu2>J_%yGT%*%45rhJYWm?Q=AXmxZgEnVh|*xO2@z7 zIn6|s<1!l-Rdq<`O!%@?U@^fRWw8MU#U8hw`eRuZbbwK4w^ITnmMB*6HSW!uMPH>w z7U%*(jd+xnDw;$h1AWj4G7qivh@^DqR|hkaCo|=VDXHQ{KiaK*&agd~vAN^vH_A{5 zqPdqN{+r0>$3*1oYB^3^us~>czBw=nlOS0@uqxka=#^v8wy1VgBJ(rc-NTkjelJPu z;ACu2)Dw~_6GQ&S&E$zunY}%_on*8|^Y9lG$_6g>#whz*GWQdCqpyl&MBEm^Ezi0| zB0n$rcDH_RR}8DcG@@*MjCQg@6>=(LcSmFrlqF)b$Rx|9j-*vDwxkI(0~OVW859j{ zUz2VG2s*okyD~|K9BsQ90Nc5_uq$2h>ly3$!}RvYF<$TjmD@&N7S01fAfmWUaE*ij=RWQf18G zZzEE;dtAJ1zC2w9_CC4r~r%>;c{4Co4p0VA0Z02ePhVVXTbG3rJh?2q*J z$8G4z4b%!1?SkWZ+qAj6nqX4N ziqMM|m1ADbsM;O53JHqit9fl9DJi^1_?H7*8$G-REqY6iuvN#}IF|)Fi}6f=g5B^y z1IS7l&yjR3!G_x-CvuDe1|0v93^&(!I(x0IxcnQpg)k<%cYbDL|Mb0+a%a2tx+CWS zkHsW4(Qt{TdO68FBCub7b$5ts?9Q!$ud4$)?Q~>BRcWkAGEtnj0lzl{iZamNkpA>V zu7cXSi)K*>uT{_F7=XEQ>uv1<>63vC+t&#brqDRnVpu$RezX;8h|kfUZeS!8_}oMG5IwaB^gp zcn=D{Iq&P7z!venJG3St!x38nuWN~lgd`5MTnETp25lS$;W&7ygQ#@#iulo9k2{0a z_Qs3N@VVcV>3t@gV~5)!)jyO@Sb~pWRYu|%8)e)2h^Z8C9AE83$hZ_CQrmXI^VKi| zU!_5S0o9f=E4IzZD0fwnBWUOztC0x~6s2etxSkczQ!FH}RJM6z(yQdgDqnKRWH&r| z$nu7nkkecTCoGp1$up*xmyBs4woJ81W4E-I*9w)CT~(iGrAd8dxLAHcTmdrZ^ISKZ zNZ3*!oLP;QMd~@Co&KIhV8&9@-Cmj>Ti37dc}Bi-*^=^MxB0zml;Z> zl%OmnjL7IQ5TpgAT3p#6$f)hxt`)Yu$t}T}M$YTiHY(K`{i!nBLk5Jq#9um4N3!oX zEcw>Aq}K7*LefiVRSKHFzyENXUNiO0$N7bWD2|nZA_!-NpNPTNP@^(ApFwwjKe8Xfohe|H>cB#e0+o*eTarG=Nk9mQ!X6olGqUDBQ>XL9!k2SH+mhAGJ zrAic`c*x8^lbD2r)omXBJ5E$H=b2(CB$MTduD%&?5Vdp(B3^yZMyem8X~4^g1*LqT zr}HSG%6-IAGllINpVc(Z@jlqWz1E+qH0zACK3hk;*DyHvFpAuqCK}Dh?e#| zUTV+oIk0lk9T`TW%u)|~r6lvwxtkXklyyKxheFsFI|w%;gtw=|)M(I&dCAu~Y(BBX z`k;oZw;}~qER3txn%2x0eu`QS2n6R%XlJ%HCJN>Ehf_7{2(_zjZN?TBUxk^q zs^EQZ-m+QkH+1ZdUg0vCEDI5E2SUdD?Gf#k>9jH62(PtuVYV$rmn>|t9hj*Xg zCfeX~vQ(Q@8#uZ^+jr}a$NzM0)VN$=pG>{05^p;BGp9ifwK<_c`a%Pa`2C-_w|``2 zqk`Y9;}{bQx~i5`Nu9phvD*#3)~>=InjG|T4@F#{kSFIQ{rMS$Pe%ki4_GQAfigJc zfR&jW&6BIk8n~S{f5`|{^2zsF&@7R_7oKc&(y?g34Gi)mW}CJbur)OoHeGR>N6Dt~ zk7x@}o(ExWDrff{Y`?P$yS{Z+gZ!ck4FeqYVkeqcL%CrF?j~AP0n+=+zKQR00*Cj( zk&&)1PgdA;_LEV5+iiNqim#+(Qzd>mbk2ucQArLCR~rN^j*!MrG|y(v+9#(PNs^~t zEH!>^GJbQwSKge?`{`Bf{YC9^+uK2Cp0Iezv7!`$Cm!%d-a-4~k`e*Ux-R8e#Al&w zwMyl}-UaumN?(fSJ-iNb#VJ7cg+@}NG=45e*Hd}C60~|X5Db)+)aS|H%OHWK2FdES*%AoBwL+l5n;{6bdbaz=19Y-q?#me% zR02%jKwkl$!`rd9y&Xx+ACKc@jitZ`UJ}6;8AA+CtURmv1xz7fVgQ%~8-F#ML3>>9 zA;sN!9WMZmn8mzrb}%fbE4)3>ZuO-+TQ*pj3C0~A9jD*5IUlwFgQ6u*!adB8=_N5u zx4Yyn-q?&+h_V3-POQ4i6dVlJiW$knWaYR~gcT6<#GXzzjeG7eRR71LFFgk^9bCiEawQGJUc$7%Z_B z2~N0~O66$pU7m!x=FTBe2ZzHVtY_m@ct5anb~rZvButD%_F6xxZ_dn8_eN%zX+m{% zU;iyD!x;IFvk|<)mKPXT8|S)G@9$G_MY}%3mqK`W-{;dE-Ir^l^ zKUz!|N=8o&N*uh@^&z6X#z|!=a7}7c4jW8#l;P0zL4l&{zUvccQZA&@)F@!1APQ}3 
z7)~)uDi?RnHM-1H$qTxTIY)n`W)nJ|>imhd?}NEU!;O;?U*9y=#=z$`Z1?A0Z5TCN zP?v-##mtG~J}Kz_ayD9GL3DPo_TV~uCP8S)*LQ9Ka@Pu)0*+IXcO`Tbeo+M9~RHilsmq|$Rk_y`5ytNqbU?BAG{XR}E zgtuu;z=oA1U^6;8TIb&K=HXG-*!TfRN0hx@W`2A16+0weu8Hbe$y~DL@Pf^W4AgKdXIV`d zHZIcUY6uB`P;Ojh%|HoVYYa5>&*APPaoT*lZQ8DDx1fG6`s;i2DupiMmRReS9<7~tV~)rMRGTFy^Y!B?~OkwH2*Vs-9K=lQ$w(|dA5D9 z%CRz(9Kre3FNZ-TTO$bYInCfd?(S`ad078k#F|w%>xHJ`OIst{A(^k8Bek@|+$DZ$ z;e+?o3tYT8oBWb}+Zed@E4mm`9lO9bWh1n(DC`$tU!(U-M(|d!>4icss`2uoezL9{v2!P!%I>?{4PRD;T5f@3S45g$wx6Ea;PW_s z0@j8oJTg#)t0|CZWVbW1akSbD4}e-$--vlEFL-&;N!|ldy-BrpA7^6*YIX?hV7R-e z*l;q$)kkxbwt9$XSL=Rl75nftLZ$s?ID#;y$ULiofVAZGDvzjAEg-`*+RT!V@yw?m zDAC|BsLIJgQ8lkYjnPS{eozVI z$k_9Q1TJ`RdlnGqYtybJ?;ORE%#Lb?oyH;}eW8OL$J3^DG+|v__x{d__nF>){M|ya zW`7r-@*2!Ze6F!T1!+lvY&0y|OrxY%Ezgn+Yz3@;e%xMQCXm81+^dh#p+c_6EaLnV=X|Cr-0Gake7i!DfZ93j+SqIU#&d5tcpR>JA7DI8muv3~-hYx{)&9jdw{iiG) ztJ~(mJjD}}O&SsJWVD|*8O|nHVu?goSD&ChY}w(WIPVm2&3r16ZYXu>bUa$ z$ve$>+SIHcTWha{8rm?a3Uoc!%56_SE}9Hggzc5z-y8_=J#m-?rBWOY*TU-QwuP@w zZZS7%?o3$KCW0U@lFJNt5AWMXQBx~5wU4U24#Mfa$NdF+W4mp6>AT2V4 z`t|Q`)2luW82fTKTYssFO2~P~*?s!KcH&1WJ0Ho-qbesFRA=|ugN%VPhh0iwV(Py> zlKVg!wKJQo!FLSHrvU24XC~eNSvYx5L&_PK8D!?Qd{{*+UDOZHwAU%Tq zB^F3Y&p6%ffqP2M1*4i0zEGq6u|e3vAqH)FR$f2#ZYa|YmD|s%XiK`Pe}VhR1+%^u z8(PrjxmyQ532w_zelT-jp~#T^5n$%%iVrn9H0k?XOLy%+DWHC@hBCda3#yHG^*Z=> z?FnWif=D07DEk?edCP~;D^+}z>{Ty7*PEj*c{3Ge;a4=#aJeY0cjvMr><&7LZ2w7WP&43d&jD?Q{sD^q=X29fsC zTJ6$_>x7DJ1F8xrllNj%_Vq;T{;5lF^jX7+o(o%zf)!YY=6`&owod7jUEL14SRw^f z!m2fiueRz<-c_H%ovUyA7l^FhmTgufI0e(A%K5p+j|T z-*LJ)VQUE>^&H9?X;TC;mwKQU|GJipCxX(Df~3gfrho;FUJzR~rrOP@nF+m)Cue2F z@F;r!nc+pAJPF4wqa;x3BLfTKgH`T#6;HyB7)mD-eu!}#6!7xHCmJdGi(CzEjMnzA zvH@w%?o*BAh;+of){nxg^%jx8DjVwsAYNUaTLqA;2LWh_t$>X43fdM-H=pn}?j?Ix z^l0iE5HL%w`5KwKja0{czt7Q_qT^x~q%EHVJo*w!Sihz;oXALCNA+jAZ zNpe9zG-eh&fLs4TBEl6V*Hfa$&GY}v)DgqavLvgC8iUSR+9zMDH0Q2lrPB7l5XBf} z5OXzZvhlN;iuxi8Eb-U3fr!BCU&ar~=`;-vWt27K zetAP({gC_f_~vkx`x-iAIqa3E^&Jcm{nybx$>B3!KdaB- z22a%1#BF`BP(XZ$_DNjo_5AaFKBX>i5d*5hY)FQU@tR|_283AX<8V&XsHBikvz?6% zI$NJTB|thd18RXlhSe^b?Bl%$udrH|d*gM7vmX%)yAgxed;%dAoaO5n+PK0!?8~lA z{|0pJDK}-hWjTWG6=}3%wGS)q^G(@8h25D;IpTQi;~O3YSYTQWWh#jzKDiyjk@n%? z=7~(M`=|5C*xdy6Yj1g!zJk!vrNhnn*pX1^Lz^}S-6~eCF7?)6JV$w+jF>@>fRtOE zdP(_C`^cx#I&rOp4@C5I)E`xmNDB>*x^QcQbJJGBgoiLG2|+BZMN+ZRE7HJF{K@qpb5sWFrn zAe2G3LmJ3aLlQgdG`q^h4J!7aTaYLYqmEB-p11mH-QA-$bDu3UW2iNamzenjkEsNZ z7Njk{pF*dJiy-bCM$j`@l6s&O9RRWwHaH!hF^zclgFs1`B&+S34zZ%;E^%m$>sMs8 zS~o$2BQpByoF~o5s#Okb9iO?bwJNwm)ozm@=z?{8FL_(V4Ki%;g0|>O5D@G%PEV88 z#$C%que{NJu3`2H6%cBSTt&@`&r-uA*)D6ZFVVO3C&LQpnAePF(6L?U>RQC{1!VZ+ zg`Lr?tgILVH0W7Qa?kDWMBYp2cyF{6G8s7(umBOdx|5}x_d~2y-3dr;IwRu6Q-!?R zD^G#f@8V4KF^;>lqqDA8=bi5zG^o}skZ+EOJ4bum9ZsOS#LD2~6s4trTbKj;BFo7R zyYuNY2{qqO=bKSX8>Rrofw6b0QS*w$e-avm_*XVg-2_J2UMo$RG+{Q*+z&BHGbEaE z8>82Q%>AOE%VDa>SoTyu6>MB;S|S{e{Y`v2;p4h!Kf(I_C8l-Ud_Zyia@}=PCy0-r1O{#H)5X3in~QK_wDGU1&L_HY4vd7jXGVtc!R zZ@MREMR4}L9lNYNu>SZ#O#~F3bi>Opl$O?+OumFO%i+9j{nbdWcbj(z#soe`V|9d(h8V^>mc?6qQu_2o-*nnA{vcfZRKd*4FFG=-A@ z3Hw_TPUIYryg@>!Gz&}@Xw~lOmT8#OS!ZywPgNdA2v`N(Y>Z8Odk1^zDV~bJ2Ul|a zB$@T${*BZieqA4IwcA5RWgx#jx495k7M6p_+fX+`S6?^sMG`wIV`~8-+|M^Lh|hi~ z(A|K(9yu3qSabqskR62CGca%q=m0u!o0kr*lf?XI`D{N|iGJl_U_b@{14r5&$6Q)r z5q)Kjpyt!ZNC*R8EWc#k)+0uM@A|Bc>rX|qo5>B|E6WOx;+es0HdT?!Sub}V_3INN zxLZov?Q%R^(jHH%r4u4g4*zyRC62ip}w1 zgXjyBMK& zK;3?BF8Wc$H*f5VrV2i0gT@-9rsrgig1G1BiG? 
zLoh>-wdpc7;+YXHezej@!eM-y z9NHCWtjcm#=BQ|7E$~+QP;7GVUT0sfrn=)5IHi6M?_1M9+Q2v{-lluTt~}l7e&x}5 z2akq!qS(y0d|#ASDpAb^8b)~5dhF>Ud{(m0mB*6}AsTDcnpcG zls?qQvH63V9ahVOZ}( z-aYoWCY{g5=lQ;`&`Y|rwk1Bvn`0-eYyulULQXZe(E4t{~LWBC~ zA+xW67STq$R13WZ92yOWyB}Js%J)BS;)o$suN3!{fjXnWlG*!R-Hq?>C^eEHp+9d{7e**(@eh0Uz2PHP;jzpjD}C)T~H! zlkWLJbN*Z-@KERWbdgbm0xmPc>*b#26PtxFJT6LUW@_Y7lbw@Mnb!$!6ovEA%6&Ee z!n9LakzP_kENS2IZ3sx$`z0jkNJj~X4U0ATu)cM>2x()Cyj@}REYU1cH}R8IIAeo# zhOTAjr`vg2HJ@B5%QDe*4}cseOY6e+viUH=@?z(bc4j+MpwdMMHgq!}Z`H4vK?{qSo-T-7E&!5$x=QcG#`Ja;Qx(f2 z4&vmWH0IeHq+iHXZ0|>xAnRHhLKZRX+9J8@{D^C%YEK%(|H)d6BaGx}`a0BCfW$hmgVkDBlQDPY)4;d2HQ+fBC;-(h zO*IsW95R*iXk|$667h&0nU5{*x57wSX9uRz`!T7JubN~G#;ANAEN3j(c4u-~TU!So zVh~i|GHcwKcwZtZ)C&^Z;zp>7xqdIxbs0D&XVRX>J^_*Rmz&S`F^r&=Q?LU4Qw*dH^$3R*x1-UK32%jlrKVPXlRhz^0yVwbKiq@?BHNx zCeGg5U0$8@>ri??mBor|=)sA&Ia8}`6w7aO;UzG_2jvQ9Q4ca=iZ%@+Bf|6dxP{Qc zXG*#`Ug}v|%G04j4`EFD##mP^ye&DIiRHLkPdGhyaWUL_HDi%B7kUELPNBAM-qq7x23)Y+2Xq z?B22<{aii!p@kG*G@6Re;$QsJdiVGO!^pWE3x6H-Y91DrpAHJ^f>^^P9Oi|%C8fz(fS z6uRkyvnLnLucf1TkzQkU#U;6W2i+y$gC}!8C{TdF4!n~O{n)5`ut$I|j9Qj}W_2;p zzl;{>_l?xB?5QmeFtC_wMRwVyEinXi++^p_oOjenvxzh6DKiGg7krlR+T3N{pPycH zK8!MJKER1&h&cMmg&HdIOd(ZFm98(dSvZJR`D!9RS&`B)+Do+nJI*c~+%uDconlr$ z{iH@g%k0lY@q`C6UJ7dthlNr8CHV}{lCQKFfJsLs5v22Z-9*sy<9$2KR0t9_ZGyYI z^Y@Oh&J`Mko0V}pr=8NYg zk>|(TjhzE1psGnfp9+2ftT-Z%)ifx+^q?#%pC*r2vMK)hV3mRD<(bFr|@HU zS#y~UH35Rj=g};Ua(2Ko@Ny@+^w)-5W!hONu+crQ0u`Mm*Y>}+D>`jr5unzd=f+$d ze{PCqEq9I6@!`N3(CeH_6!E3$V4x_oY49t#2*al!~dWBjw-Ue7HgU#7?b z=8atD3sZycXr(-X#D~DD#Ao(?K=u4K%`-farkE{VM(!ayE*z22K?JS1G90H$n+0*x zgTcVXJO)>9;4DiN(5{@*R8s7LVKM(V#pMlRAx8}CCsjB&KT)s{v1ClOInlSgQ;m&{ zw+2#WidIV0^%g+#rAf;$1{U*@-$nW*VCF9|A4RZ4BfT(vYIJUTXQC6@v`c3*DEJrI zcKcf0 z?*cC}idfG<)eeGk5VuaA^nZ`mK*c}Pz$b2gTi8(c#hmQbdo*F!Gk6rLQZ_0B`kizvCsG89sqt;a(#L}nR?*N& zA9lc!xjehKbZ0-T6r#27+HzXg*r+*7N3-jA5<$ftez!Soc42Poe(UN4DJ4pD$JdB_ z*XS1a=C{yF`|Zb%9^F7A5ir=xk?y-yFqPLfpW1_33Y=g+SXhA!uin>QZ`=Ga8H{Jt=U|{F1FWEin8cx(1H>G_(gRIG3=lR8|*XcVwxA`P_ zH}@K0@ zg6D=;XU{F|D)pL?v$adL!kN0nDLvyxg^ERd(ehwPm1)`g#R#!HPuRPD5`jM^# zQRynWlEUxkcW=)9zXX62hztl?}ess#y66+6M& zt248&1!kUrHrqz~A;1=fnx+cdq@(64e(JJz1m&vC$Era2S*wUk=77eeTuL8N!R|l` zOwXh$P+t03&OICI4s87lV4Jrxm=W3)5J}_DTEF{?N%RMW3 zEp_n4XK~|=v`%+d$-3z7ZTAJz^T)1otA-8Rw=*CNaIwO??zWe8hu9+!Bp`Wyd=(!H z6BOs7_ivv)#cw@3-iE{pXRtSliHXfsZq3-Zp2r`m<4>I(tR@@yOcXl|U4aE@q!O{GbCnc>xp zUh~H}kS%oz#1>xJJvu7s5?6P~0xRDRn$iA`m9O|t*>jtgz5!pu-J#m-+t&pinMzfB zcMDIrV_=Mx$A=YRe-PARiO5)VUNYNXdTrHjPUPIoXqTItyP@N#rS!;CmHyHuE?17w zUiSzug51(n0!zHL;S7<>lq&^yrl;%@1ww!5@W?$Lbp@Hg zi=NyUXrN_zGKRI-=KFVn-j5f>#=wZ0+c-z!n3OFn_Qz#;a=bOg@wjuq9=n-4wE-2> zz9UfU1Jr#8|9_Ta*fShbl>hI-`^D;Up%W zrPczb-NKHpk{&)O0nE&SAg@ySp3!D@)(z%m0R*y@D&aZfbCow!etjxy*AlM&x~KU*hlx5J1J}8oV*_ z@X|(xXJ>WF%F0v)ZauRg@3syfFEanZ9r=QFvsO;qe7fjy_|<6|4F=bOa{NiqB%>2G`itxtZHfmbq#QO^p2{ z)V??ge|UKh>@Lqn$H(UZ9os3)KOmr~&g0gAc8VHRD2!^$&Jh0;7E4FP{PIIpiOICgquhqXIaL!Cx@a+o>-KZh4=a4p~J@%@wWgKC#kT>?R_F}lC0wzzA2 z?VI~Z9K@|;3!&>axvk}-%TBRBfB3@*>CNyh6qVlALNZ+7QIm`6*LQU_^1hZ75)>p3 zfce9hk-wcQ#^0-3oysH=^z5BSGEu{lpQW_cyW_V}t+!J?bG%C6ueq$?aBmB#ac`@? 
z(!Jh6w7;k)`Q5_5+viWZw7=OPy^BZFsJP-M7CdK|0lF!tMEy=?tIKLz@%uVWA21_1 zjDxi7_}{F&i;Rp6=tmtMxC7Gz68@ad?O4BX^8HtCN<>9fOSH~+W&gHY{a^n1)E!>@ z{BC2OaZuaGp-c_L$3ls#d7m;gl0)fa!Lol(wfNONpx>t2E^#`1^=5sTDQvrIUNHtN zOc^qfi?DRfBZObJOi~Xh)0jkHt^^KQs;++_TmN64pv?u|?c_dT5oOwgyO2HYkk#7j zscL7Q;cPWeu>7d1sTtNJm^ zx7LmP_w>|1UHk?TKAqgpb%~Y|E1wq-c+@l^3LnGzY{&8rFr~!Xy2{;7WetA#eDS9bucD z<5@J*Uy0xU@=5;Ri~s+l#|fh9K%!)4H4SR?oi=~#UyX#F={u7Zb|_%2a<)2N4xB*; z({~!(emd+wLDAPc0Bbf0)pblC!|sgE6)>_7bfjar z?~;;|di_|r{(Ix;Ab3nyJL`j%b5i;GY6uy)$a5uA$4B?U=2D+iz~S$?1XeH(qpo+A zm6ckRpibp`usOWca2_N7SJUe$6C6k*dj%UtT7KQ)QSDh$amP(#D2+0q7dP1lW_4M_1 zEi5ebBn6nLV1^k~M}+IYy+L$r;K8ZL6JT;H#byAg#OJY$Z7zU zzOz|6qu1zj*GbHNwf{D#N+A=dUfA_U6z1hONRa*0!37MbM@uvqI5=MN$>OUwF@69@ zr2oBVeynbi*baa|WNu-x3*x70?2#OGUjXM}CVIn;E)D#V-9P1-;E~$!;jhltSzPwo z={`UpBUwAZ8BVT1uQ{L(WB8|>4w#KkA*w2=R0vb#%=mXH~+_vj|7omg7HXx``nhH=Kgd&zZV5B z7U zT%Gp16F!$1RrnuG{ny6_HyKQP1elRFR~L?f4{ENj&Q(D%OOg_=h)mg=w2eb)LU}U3 zpHvD7Jj!)_<#tWhTz`@nih#q;JFS1}(?8wl(R~juFZd7J179uDWTE1KXKZ3b3EGDv zgs9t6;g4aVgAbRHkzs9TC-(TUhqz}ike`Dy5`nnHsMNoBiIBPoZ8tw)HZ8jF3itwn zfu5JFyu*G5Ybn<<6!TlZS_(%F8>E+c(MSc-bqlTMo&dIUfbze12VlGjaxP%e;F7*N z8>yX#USH7VNI!koZcY29XiVwXBch7Tf(NOOv8u`FQ{~~|5pcU0W#y|^{`Hps%i#Z` zp%T#GDJdzx6`QH#s8qb&B;>Q-(m>3ar8y7-_c*hq8%6td4|+Mq;P7xCK<6|&EVEn3 z9WS+#eenA~|M?C#>VV0MKR^Y|R8P*%DB0ORiPO@;MpEV62J^4>@u6S^Rze%#n4O!O zi_ia-5!}WX2wUr0T3T57>i+RXhB6yae{lhR4|%zSAb34Jy?o0lKAsHR1N+^`{2QTj(kDwr>;Ga<{N+Lb)NQwJv85YB z{}+Anm$i_V5!MSeryG2~dg0&RTOSjwkOh;HN1p#_^S@mBpDy$c35AuQ$ID8n^7-$E z|94GQrwUs*8SOt^{=;Jamqz*XHFX+bbAB$SH&^jrJ>!2a#QOib^Pi_RtS$e)?9Mua z+x`9h-x40*0-~nK?3?`mb%f|l!uk36?dDIo@ZQ^vQIG6w#+2rh-G!9vmcai$KU!$5fvwG(m2Zinr5Bf% zY_4Z-E)MhDxk|PQg@-V}J8|te(EpyG>1H0lcujBbdplu7abaNt@P3$`os}DrnLqsV znE&6qO)d)#l(w!nhPtNO+y!K`rqWTIH_DwGE8HD&D7O=?8fza+o|Ne!4oUG~hJf1}`Q;?HCS})hv`#+4m1yq$=7d9#& zA|c(Wq%?wb3rKf&NOw1E5$O_;?(XjHF6jng(@1yU7d@PFzW<&({yWwf4u`V$cwg3B zb3XH#&zwu|N43SgS{oAdp6r>tt+wfAhp=z)zb#3(xt-3pr4J14d=gM#YDRr`gN` zDpC|cr&Bbraj5qH>s8XziEA4gNGK^q!M6`LEe}Bt_XiKfQM<>PzSMw!?r3r{XMfs$ za+*WHgdbliGN=D->;S{>E*QK0>gWI=-o{Q48-#T zGO?+REEh{C48X3ume^Plp6qHI^X=QWp9>18<>cj=U*F#yfWS=`OH3B8&y$~iYvV%r2x=(;24c^)^MwE&@IqG9Yv- zI@>+@zkNAiYq_*uiw#bne$ABO=4^_IR*_laZyEr3z=H7km#2@Tp|iPPGWa4ACz$tvpOIqO&d`VBmL?g-bp5{86VB{aBf^u~o~GxDQHJPK|d8xHz@$_H7w~(N|+**Lw23*D={ANUT?()uFYT*?Aig2N@LuaP> z5*LRP7CtT?>;LD^iOV52DH{G>16k%oy0O)~yH zzm`^VH)XZuVj#fFD4k7k{ZB56pxkN|PLz(Co{1P+m(aE_uCX7iXPdi)3yOId6+sF` z;vot~3ib>3bLaheqYHiF^`#f`weXV3=#0>%>4vf|r|Lx7I?DL%H#CY7ip7PYpR?)C zJb4{u3qDVW2X3o^Z!H9J4Rd6d3m8bS4%5wD0)KU)-$~Z19bezjkO3%0B+_+Ip(Ec) zybt&Sn(u7cTV1X`vbx2FLEuDO@Ne`=#CIXayLqxm-JFiT5+)(G8B z#V+HXsDJ8)__I2~pri@0h|`?4jAnab7acp>+LxR+9q=4Wy80&v*27?FX-Bi{uvApk z7!uzB3&)cUY&W%x zJ$(Qsmq$ol0Nx)wtqm&L&vR)Ee}vM#RtycofOpB}Q**>^+;DrtOs#l-T%H9zxHysT zTzZT`O;mOqzO%A%(z@w20+j;e^9&3D=65+tMeYsig*sk`s4$;mqygqWFi-?>RMx*{ zdZkS6{DxOES-RTT#4CG0|B93di-*qrW^JBgna2R4Ti|hZ1o=Tjx1RZPp`(xNs?n+V9#2^#(&L4=mxLE*6=^qV>bWJeu`D;=XH+Pvv9sTx3DW%KzlO#Kk> z>e|Kpv|XHa40FO_Sh}O{LtN#+H>A_Oyq$($hmYh=h)txRt$pP3^Y<%zQ`KdxoxG-M z$Kay-rgeUHDzr8x!kq{`r4dQ!UV$B8wtkga6@qiip^gFC={?2DZeh6d+m@Di(7VPO zZ$Y5H3p!ZdIar20f_6R@DLyv?ufsot0G6v>WeEMj>7MRly8A^Er$cd_D9OmbeVh?w zI`KkAPgv%zi%WIe9LOTC!tCS&Is-HCCUJ8FMj%Q)h$gJsIA=5-c#rRUm8NFUc1BhC z)~?G?EZ_b)GBJFzmMB_UZh7VkTemWUTdV>txT@YoeRWP~DDsLYqV!hO?^oQ*U--CC zJvsax@z<<1FT+vtU6L-a(yVbT8&B$QkPhBBw+WKfIH_B)adJ*$@s^1H%OMMSEi7mV zY6Qs;Hm!Albc`QR6uIz_02fLHAVUqI5AI386Y&wi4_-S0(Mf|^bFwj@{U>rwA0d=j(qo-msERBB9)bWjE~6pe+#$0E_fswoCtKF;hyVbkZTkjL3E((rdm|&P}Y$Sj)3BRRL-ErmH)VjPLKiEBo z63_~P{5SQGx(vAzzJh(+g=|2Acm4i5>T0d*Kd4hdGL@kTnLb_nEfV4lDT4Ri3*|li 
[GIT binary patch literal data: base85-encoded, zlib-compressed binary payload omitted for readability]
zC}LTP zoyWIaoPUw2d0Iaz%H%P(Q$M7(RQA*d)CU{Cr8jODe=K-?2=|yoyDQ5xPV?^V++6X{_qj#GFsKMQRLCieyvZ;{t9Girq^=T^!ZOP zfT#g@Y}*H38Iq(o8eh5}USsP@h$`1`7%|V5Wj}bH{SkSD2gRa8dB)$k(A2yUVyhJ` zWp_zB^ahmA$(wW0Qwbhjh$EkcwCSS4qNjA5w8#(Br9MEi| zz9qLH$dB(G9%j4P5uV!R?hMkNP4&&34CE*j=mYh6sdfGl?6b}iG9kc6%|lIkt@r{| zMJYUUGjvRy>szra20q7?3ySHoe{pXNgbCP zDqut{RXV@m90H^JnRo7)Rk$tC!q#hy(i+fjdPT|miV%?=3Y+@>BT(D_`Sq@eaXvM1 zEowO84$|&gO%!kkks#aSFBHXKw%==I;D47#n=<5-^KT=;DK#6`w}zc))T20!JJ*y%Ki9t;|Esu(HD3{iH^3$#-mcTibqxVzJ z>e7CP^`ox(UcJup=(J%1t)@W@gJ&spzEfNK%inAHDfpg$&b|NGHWLIw-H&OyHefOb zTD_8kR0mDfwZ&zKq&&K$NkZJ-34TEZocn`IO}5ws1Oo)FH^;gffhL8#5y;dw=<;ddC(blSq~7ueD*7u@=Z# zls}||u_cSqWrB+b`N$i*MG%J!3CoS=?~B+#l;QHmXtQ*?ZNc~JunE!QCm%l2yqWfu z`<eiA2t{)^C@0Sk}z{GJ~Zr@}B97Kha3s^TUvOvev)^2@?(Qg`3w!?B$Xp-~jkJ zVTJNe5yrM4eR79GPDe< z(ADZee^SC%F$3V=KgyA=Q*;i~t@F*F84^DqT=3MpEj{Auncvs+blQDhVKmpqtcO_9 zlZ_!Z#<4{df@s$DtFB8=LXP7TI9WJ}SJ(ThwM7W4;1=LgXIm%ut(y+CQ96ymP^Wx` z>7nNRMgUUX@B^V^vbB!%x0X~E>~HchX)Cc!#}S7~YIBEm75L3hyn8Hjih6^1L)j$! z`D5X{(af}Gc|#&5LWfxzyE7wB5sQ%hQme|mMsjTogU_z{&dae@S=2t?zqEk~r~`8r z2&*Wg+z9sKv@s3)>nUDJ=zsh3GQB;^P+TbK25~;p8%-u(q>-Z*{kJdt%hz17soal! zyvR>w%`#r3ERVTs{*hm?pDT>jSnU=>V}0EI?WfU5Ty-mc*m zk+zVRm!m6VE6>H8KXYO=%A;;0>}4j0a&>ZPY?a>A_hyf_v*~C5{LC(YV+>`fvT ze;&4$*0VkFF!c8rypfWsH9f(wt9vPKh-~X-NQ$h|t%LtFOKmF8mT7OP&fO$x^))CT z1es`y^SatoWDh=NUwMe@S>cEIjJKU}Ugex&Uv|{Ka~LHU$27H%Ba9xndbTNQoytcs zf~mPiC-5$ITAg1+s=Tsq``aLL9LjD(JQR5%tT6#oS>k51UvAUL(!xG0GAsJQsAVC5 zJkb5N4!$>*gc;=X_9I6djR3{kB_{kX>`B-mQMrrKWk#U}l5E!c#Oic6#-KT@nm;ee z;cyEE%h32fw;NVSuFzoZhrXl=kh7zZG2;)Dj!4`re4s%x#0`_ygON&_t3hkN2BN3w z1PJ72-cz6n1QMg|{IUUuUXdix6SICgd39Tbw}xzH)+)C0$tF>=-@)%Z9DezdiJ-D) zyz1wo0n4($C!nh_WMh+P5^yf8;7@)DB?^comUucAYsM@Y&--K<kPzsW-kb6%h7@V)-~48piw9M z43`z(VFgLHj(!mzwxQ|ga1p}ap+6`DuO%dH78d>8Y5w`N8(Dvyd{H#~L2X#D_3{1F z#Y0%9!cG@KkUqOhp6J3bT-vO4z>lPazjU9JxFkr~biTuI%K$7`2{phZ<}z29d%VBZ z>au*vpiUk(%=v*+)**^E0QVK5@&N@!i|iAXK4L1f%x(0(CAB+b|YKIo?X(yrF$+V>BcsHO7Uu)NZ3``)msWmUOJgUGHs1k-1EvT(88QT8n zKVx~JDg7?yGJWReh8as106xTL`^t@{_dQwx3-s>%mFxTjehVHqH>#3bcC>XAqfdlC zUL=!Y%EUvWVLPSA-CwUfZauoIn16&P!0QGmMsq+|S2u5~Sya}J?22D?k92#UxqY4> z@AmSPnOX*Hv{C}gud@{rzWdssQrNhv-vAmn6HDp1_3Vv5 z^TZjP7Bh0~4>#9LS}eL@KKULAbdKa2Wk$qrv-t4jKHbMdcD%l0>MBNNQO0kY6cp5b zy1^k^@<`uI!{H>`ynqeZSN>)4IVq>qxo-OSy#qTw9;1A44I)K>)hig=DJ9{1blnrTu4zI$}d&b(fQ77yO zF_$dv&veEO8Tt(xl)VYv;P!i;f%R6eW7>v1+2Cu#BQ6O8wPEW#F@K8}16&9pDwBL` zu}cx#sNQ7Sa^dmDeyls+lk+BtvYCP1$x>}9y9Zm+D{aG)c965GO`K>Q7+&OD$&gzb zw*mpz`XY`++$Yw@wp!+ojTA%_*TPFBsWt5DGRYo6Vqi{AGKLn_A2{77(k+reh4C*4 z;G)M3?!wdR;58>EZJMAcz}*hO`wHa1-b)-lC{)R+OzVZh>NFbk9o7vwczDRG$o*|a zZD7;C*8s_c;T=_f*WyAm6Dw zcmF{zyBt<5eBy{xHf_8*^;E1`%d<<-SX01~^+7)hP8Rjf)qP_tH>&bY!e(|yPJBzc z1!s13gJxYQo{(aj^26hLBckpUEJ0IXwprgk9Ej}*svWk)B{%T=HGmxvo069jzn_@I zHcXt9-SM%)G!XW3`%l%!(MQ_d%#97O_}Hw@vWT*F$12kb9m~gsUpFm(ee$@qD6ahe z-wo3Vatip!zGjGh>LyUmC%)J};8&AX{|O`!`^LDCLA8y1N z>>AUjpDm{*Bc2o;-VDBIUw+j-?Juli6LYDAkZh=Xz9{x#w5Ndbki_f)LmlpVd#TtZ z1qWQYGaD!K(FuV;Aty_;*&cvpA<=+y58`!A8;VKPzxPliOX;bQ^~{#kcC4`8b7K=8 zcAmkF97lnsDYwMm-1Bu4ZNw?@nrCaOBVp;zh3zjnZR}0YLyJU&NQu$vWCP?4r{zc? 
zc^KMNz}-E*ZCce}>ibO*9{mT-bar#*OG4&f6Kwwea7$=f^;J;9k=o{?JR2;n%7=IR zuUxDh`= zGh1W}N!6YwC2q>z$>9>STA9SlhLSd#s1%sB#^jRfmdV*fT+=10nOsrmxypv1Yxi}b z3M%hMKhj*&VK`J& zjOW^w75{cFE~T_O*`evrYSByn=^aNBW$G@)B)-k@1EE?*lm2jBnYnkL)nSH{ghl== zMa|K`4h|}Gu4KtArATm~=PgSJBuR1X^m?Q%Z>$Ww`LW!W|IQjy166)IIk9|poJ8aYTddI|E;iHC@m`B6{!3HaRp{gxM>B~dG`C0kb zR6K1gqWXROMSq&!4Z3O6$r(dr`4X}@&hj9HXw^lwDHha#)S!l4Ea4C7Zd z<@@F-qCt+>(wUCL0fQe;;LD}0PN~tFB}1DZyJsw#;|_i|lmV|6=IW$ME^^{U;u5(l zYqQ3yuYM-K5WW`yHgRXp?2r>j0!Q>O_mguT-i`_xZ7fwC^`wES;uUSI)29GJD>_x_SCx$)QG1Y3N%>P`0g?svXCy0#$j_6ZNKKlMkHUx^->fX2+Yn3y_rQLwm{QP#~u6qJyjDq*#Dj_fK z(f3h#VlW&96RTa?tENHUn`OG2n%M?SflV32{3K+mhNp)2u$}ZSrevR@VC+ z6n^f6FOdO&g_W*-yIhZAU)NQ+!!wBl4?T2t$AH1BS3u4@n$?B{7+oza6^GSm*t?hC zNk#r4Gz{U7LPhl9LGi$%jC@Od`S}BPc>W1z+V5MY*2^*{Y8DoBn6v2UdaciDCLWc^ z#eydK4q5}WZ#hjITXOf;8s2c4gQNm9g+!&mpB4i(7a;nj1L845PRLud>~m+KfT{R4s4Jb*{hqWBorOe4tZfy0`IC1_&(-t zB&NP`iDwZ&!B$r2KMIu#{FY>@cnfT+!&SVnY6_DZZB~8aReg8m1|PY%y%DH&uBS@) zfOSo5*@n9Jr4p(};-(gc#Kr6Wrpyo2(QnB~fruM?k1NMe_n&(=o`Hatryw;*zd#{} z6ZARs<_mntEtvT&b_57NQ{xj*l^jW4Q3Ab>O2M;zUf`(LJ;^>?;Fo_(c2v@ajKDRD)-5$A@0w&kaJi4Lje~SR|wO~d^wI- zy#CT;+qZ)xLpG_J=3aNX5s$mt|MqN^$+<#mDZ7ThjcBPb=n{wYxUFi#%w%S@G>I3vwDS$Nh@@h1 zQ-g7NK*9-^R4FQ{yngV+KOE8D+dbK+5OVq%dgha36yDY;+lBMG7;Hed>}!3Xc6B!^2hnlolj@J_aW$MF;Us`+Zg*F^l4 zH)|yq6`27h3|W30E~i<{2D26={w~Zeqi$ zQZQ^EU9k1=y#vsxLN99~`;&;xw_B8(l*D!)o3Pi$8=aUL7P=O&o0`%J(^293Qg(!d zSWn?F-*2WH?9!La1U(vWie{9knI|8w7(V#yMZRq8EEzW@#FBkMarRyE+sq$`bDGw1 zysr1Ze8uL_yj;@XT00dI=sBDc~YmkORRHjN4J_osgUee;gE-GT;m`UWOgk;+<|VYdQn7b`~ZyR(HZ z_%khON|$IE@)HLpiL2!Z_YaeZg;k30R9S=Rg@B0Q+MSBiAWj2jX|Td)hp`l?88sgs zm&EB_G-dgTO`}{7ErS#L4HNPfBw%oL8MfIX_{>rXRa3V##tDJUvfT8E(M*kVg?l2S z;N&z35SN3^PIj2@{6jd}hn+q`Sq7^iFU?!~?vJ>{5~j464MHG6Ee!$@(R+VdP?p^6 zkkz?Bt!Vr!t?q9~;X%Y007mNm9$G1E@y_ zzsUrgP3U19PL1td^(E$yf*g}$LWj-G$O*NJMhbzZ#(e_kO5)SU1wk;oOYRnW1DS=* z1-3KWqy=r=@l78RV?(L&EO(UvA=^*mA@szky`B-67n^UNls5lY9 zLE0D=kEgH)pR_g}{|=~Ek~Uy~KWT6;1@(SFk>*Z*kUZrp8CEKt zDM-{46ChU77gqH>GioKJ|?tBYIH9=74XVU^j(RY~$7!V`!} zhQv@R7b#QT!{fMX${oYHQ1r>yt=%ZPOzovDkwf+5*VxMb?o$5IeOJp9i!~9G3-@tz zp6*A);|@YhiJROxd|~qx16?!^Gd+_2vGUu^7GarISC*t^Jj3PndAF^e+3S9}2`xon zeUKT>s|-#{6X&w`GK>WFD?d3%t{q{SmmgC>H^2vGSF^lk;;zk+}q-rV3xZPBdheEeQpe{E3M-Smvt)u8YPcJyh?(D%O_v!cg zZ_>?%lM6jj1%0taaEF|B0RE`1?rYf3nR>p?dxRmF+rZpk{g7$AnPI8{Mm{4wykza7yy8kUj$1m9u5t26n6C=P$b#DGQ zn_`nAb$_dfBiw;8P>yWJ1xg7e^S^ z_1RDQy)rA)W|R=x_0azB&ySX|C~+wHqIvmH$=g9E1(+HT^%IXbImS^WPH=G4p&GPc zlg_;XyGs2weosMVXFi&aU1QYMlT=&6SOYtT_Gk<#C@5;2M~B}ATa$B1ne&6nc`VGu zo1l5VkV4w|Mk&*(2b!$3F>Qc!a#VBn_^+_AbFpTErnBVa{`*+rd7P8y%}&Oxp01Dn z7uORc#OyZ(#n^<+C!+76DE<1A_c1yRUvieN8}(w!iNwsA^(DVej2-2tBjzPk^OZPC z?F967bkeDKZ{D1)x3QIXPaSEHyDjXmtF}kS4BYyY-GHU8Ph|$w^@i9aCjuNOAT&_xc|wHhX1k65~Ii{27w0ne#loiz0ZoTcha@JPA8?_ zykpfozv(G>C-R-ENBHiJjeF(rkW=PJ;j3uOUspeoC{2b&DCvz7v*L~`4|H%TnItsuzV39DueR}K!W=Q83^G3oB-$j;D5)KQII|l? z+AmR~9@s1(jOq>O*(HJ%4808Z>bkOO{ttr)3LbOrgKY3IzImRVwIldEeb;|n2id~8S zA|j?es-o(RvhJq!1Jd|YY(^51o z^>vK;#-dXCg)&C{@QCnWV<$TQ0^5w>%dOS>sKjX=2{Y#NK;frn|3t{ zZgHwLQ6lDj|DYUX`E?%0u0>@anD+5C5`+<;rSXQS4uTiH@ z|0vn4JD$aLy7@kB$xs*5>M=H4rB#`ravn=ck9vg;-*`cLbUa|LawSEC3L(vUdzflcnobi|2E1D>3SoaifDC!>yjra_UxRo|{5R}Zzk!4xcvW@_Q(+58>; zSr>iIC`LZ~*^VgntOAr)lbfkm&Lsl%YZo%^O?Hp3@_|?PARX!yk!+VimKf<&O^A;n zWtu9z;Arca8*703KyQhJHKcv^s5k#J6Hk3y;ESRd^nPPPSv4|>F*6Qw3Nb!WBCp?9 zY>z&B!oO`97Xj!&ni`_d?;-R@SX{58=2wyfMgI~;wMuq_hrb`-wAb-8t`;dBltSU? 
z&}|M$jZjiCuS#He9~A^7`dZ#I5{~n&+B%BXM05lOl;p*U9r(B$g&BtB?tc%1@5hx# z9Kq81!X69`>v+leJWZKywx51YJ{?bqxotJR;54vBr#0s%7Kp!rA77{|z%pqVyBPMU z6IZG8&?GW+(L`&Ey9SnakLm&@y1_M#WBNQV6P91&F@C>(^uFAWuqr9`l(&8a8i6q1 z_(8tG_1kmfMt4>th|k#NdYhp#y7Rl-ecfL|{{iZK%ZmVHPjH3CJ3X>*UjSc>gK_j< zieYzq2OA8S|0z2c$HzJsE&1eBba6^RY$a zqKHJnMu!|tVbbROWnyAocc_dmDl3S zqD>TMzV&B-zo}0*pZbUY8xEuE?cM%5WDc!zIj5VH$W8-wk=A zLpWK*P4}D5-amn=hRrZA@c)&1Ea47Ac40%J0`j&iWOmELyahfG()%>L{(| zn5_L~f@eypCu{t@+EB%g3K|V#ii%I{@HUgcorJlXTpL#%^jkJMRB&w8DrGa)h(9t1 zfeJNajWY0TJ*RX@hsws7uDpkOV(&!NdiD5mHe?7eM!U#ZaB*Eos`qDA0ErXna9Ko9 zTrm24haT~ZR@jHrV~a8X=DwU`1oc5Mkg!f8*BMKM{^-beDLWbz=xzVt(78=@KKb%* za{T7wfIx3#asY0!_bp_2g0C&lTS#MY#|T4#QjxIwuR>&;zOJ{3S(x^s@c zN!~l;6tvXKZEF#9gFA7y1NCs(y6%to;aM-!I&a=Q8Zxp>4pQgc?wq$@!T%a^cAoB zB|ihypm7nA-3LjQPFq8A(w}^;FL`7cYD{J$73!)biY|u|dEVp-dqhp&jG!yvDRCPb|Dq7X+R^t@#l^SwbLsr1zf}dz(m0PjQEqa z*2s7*1gGWg>Q^a_48}H%}x!dCv>9#fAeuwMZ3lDK0{*L92;ph{bN$_6T zy}DZi@d54tY+1;^y3GBx$hv^cMi>Z-93SH-fA02N_q|%9?E=Qza-Q3K+sk2fif_mj znBmm8AcaPXDI>*h)@$uuu76OY|7jT=pvL!Xy04!^a~}$>;)98}n+b49k@HeS!y_Y1 z^s}}FQKqaM(BQckAHEF!T)T+ z4QVl*p*O|@J9(4;=@oH+LY0l)*GSIno=I#&lAfQ+AJf^TfH*yj!BY`aj(-{`jkE1C{6<(hp=A)0^={JL9{nox$|=fqnR17YAgqY$8lKv=M=9`z3z!Hf!G7 zFWpM^Nx}AF@FYc;#Ix(dcblS`_I~pzEuC$F*OD|hL=;&)l*xo9nxo)x(d`Ri?4wVx z{qO33eePA9M7LT(N?P`E==~JD^63MY-mRgU8vY-I36U4ENN{ zV$Qv^br=XTqTo@xQ|CklSp z;S;Xb&c}cEublX&^(hg4=Tmx86T{XQNP7-)BToOV3UL3c3Y3VC5pJOtQoNAJ3oAGd zuipwaTy8CpUt~1E5PLhad1l;6;gfb^-c`9;$4e$7$Z75Nq)7cVrtlO~)d-7W$!!sK zj1h6{H4dZtr1p&2MeqAV;VF9)J5M=u_Q<1#!*A@%g+KoP_H1tgiQ)8k+U<;INi;bI z2_*$PqGbzh*a`0zQ_-!&)}9_V+mu>S1T|U|RYDs!cFd4z=pa$I=-L#Z{b0^K_+k;I zl3_r2Lrc%XgxiR2uVeO`NW#%7VZO#b{mJ|{3Y`l5uvCIq_<5nJZ1+o7r_+|=#HYsH z$`1B6y&kouiOHLI7zTA$c(2BmneVR_9zTyxm~ef+3buPK==r>QI;7R)fqW-jI%QMC zPqsW`T(SJ}k-gTFs}NdNd_}o;ZEJ9?`Ty-BUw9tQ%@N~|XmKw2Fs~E+XT5Kr7NUR4 z(f8=~81TIv%?(_-wk@*b6wUm1N@>1W>k0^JtFVZ0^@el29Xo86o#nYT`=7(72>`XW z&PXIq^a8(YIBF}ai_|Z)O~unc{qm+N^Wr_Wk7HBZu+4mFPO_Bq&8MA<(JO(mT`ujV zB;iu8M-uZ#BN1PG+rdhyu!MIiH9jI{@9&7Hyp3L-V)Wf*K^IdqlWhZoa4|`)Pp)!H znAT45Pm@x4i#{(aw<=%$g?k-TM4+cF{{d-I&~2)6@Oxc9S8r*D@8FNG;DzhY*eu~o zW`OQCXIft$rm}SpNw^@^J^$jY($@4=4c06WK`vPL#S4>zig}s`)YK9bL_~tpS&-ySW9o-8X5h4i$O@Kn%yXz%+``28wLa81-rO;_^V_#ok)>XgA6X45k^e^*g z#k-nE?nu&P83S_#yqf0=`zStpTups^fp8`E8DHru@=?wp`Mf<+G}Ykc#!JzkJ!1sw z9V0#-dMvSWq!Mj_(}xG%?=+ApGrU?l*XSGA`;n$A$~6A2=XOM96tskkkaUa9=>tnW_0HLoPafp4+kv2F;p0}2LEOX~7^bzb;yE!RE z*6yjl;$fcEW2LkBs`H(Hh~Wj&PG3yEFLl)A-X`)ljZ`Dq;5mw$)pVj1- zxY#bfn9GjCCcOL8V~XmpFV_FB@BA!JRLW#-X7ToXlZsZsS>{{dod6@SCB8nVto4FP z8+17*=xS6w)})`+jef1%F8)ty@mv!)zhY%x09!7;{C^v)yaZ6PB|mz*$OWPl&Zs)3 zBUu<>z2FO<)YpzWa|b#o@1)%}Mx3lwHshEAouz93esQ?)R1UZxM~ILsJBW0rLYja5 z8Npc{@Q}O_UW}1XxLHf`(P?$g_k;)vAFz`w^2yH%Wm75=IB%jlV5qIp*`uI`n4^(O za&j0hY@JXbO*Hk0#543N?nek0Vx+!i5xikaJrhQwKwzmX4)y7|Ef=J(L0Cw%7>lUe zk-mFLGvARix#yaxw>DKFuXn$Dk;3wh<>rVl+OP}xP0MR@5erP0zVq?7ntR3T5MPRK z5fvV$Cu>}SP^!e)oprr@$0I$^l!EB{{SB{Hr7KHFao06dGWd-68>-K>QIT!kV;lDI zHEDSYLnaY3W6hfpCx1}FP(TOM(tOp|y5B9P2RFEtY)ol~r;)&-J<8U%u{G(UW7a+s z9W$ZfFmA3kk7Ix|SrjtaE0TDa*$zY#vs<0gCQOzPX_pwF-NtxGT_CklzLR{Y*6x0_ zt#$lspqH?*db%T%g#i61S}o*iYiv-qwPvMkdLoBZ!vuZ>9m%DJ3<>>)7dnpG=p}FY zPp*8rC^CKX^lY=BDnt}0xC0*wX1!WO6yeF!q-^`8)L4~T+B!=;kIcDJ|9O|B;Koi9 ziV23*QcE$ByMvx0-P$EbJNqZ_FQ18xCI{sLBCOb}Pq!5Wap*sa1oyG9gP_bWtp95*7Hc+Cyqce@4z;3fkyCylp%wCJf)foGjf-L? 
zx6>;nQnI)DztJo35{#&mC_`zOMgxEndGff$VknhKzkJkIHi;R-C|c;VN4gxVo8boH z=Z8{`Id@xG^9VuX`C~cn7R$1Bn=j31qN^{5Y3Q=@;q7a^4 zE1*RWsdO9pspNwxQ=U9>JES4Hq)vzT$W!Q`Buz$LqGDRzb6(*@20OXsTBWxet1U65 z{J@I_1*1dHZFPMg@%rcJ{>Z(|GRA_P2(hjQM6R*em<)V%XCF%xkaJdkt;;IvU%*M0 z(3731C26(5^!JV_zN|@=DskfeqPmtfUjOLv8_L8FOC;yGAt!%a@1BuA5>)|U4dxZ? z<6rgfkQSAu!Yd`8?r{jiDqy)~@jLsliUIz7)grYIq3FP@z5&Cu&=g#ay|soryVuZu z=)O*N+g&&|5n_KOKB1`)Ap9uelvxc9+m4d;x3U>`arojhdi~9`y*ea1`;(1Lg%2OT zDY$dpR@Xq+msyhjLSx>^TVSH&a{1wu4kSB9yz-5qkj7jzGVfbgR_*S-xdo`~^}X_O z#MT|p!*$>60};FB4|yZiW6j39m%wuEG1VPctyFe$`<0hNN~vBPO>%Xx8>URO^HZ)b z_V~Lq{zn3fhlD*2Y<4*}`^S--Q6t3%ixDRr3A%)LKHOEsxP$_^;Lr-g>fp@I!t>UH zAsX}?h%OB|{#}}Ysq~89y>v*nd|iWs0p?7FRqx)i|0PMPy;wfMe4%GLTiAE>y9#@b z)}P!#x?0_Ne>$||H}@aobCsPhE*3n`kwQnc%lUDn+aD!sd|sqT25@fG&)C`VC|DcTa!`+fE&I z)AyZelC5os>GdfW@Dn3vrG<0-Xa_27Y%b<>vjW`{GiP@{QJI&#ld<{3)4}#ugRjyB zCb4OCIV0Rv1eDUsj+_n$B^5@;{_DTot%|TI@RWFPQD>>pP0qW#Wj0Sgv)3<)F zl^N}6NuPSb{@&MSqq+0OWETV`*i(=sek;>3(_yTSOH%BYvIDr!)p#3WT{StdTI&=~Xo+uMyJRkBw{otL=JJse- zBPwXRVP(Dns34wPtBf13Nu&G=I5= zigHJhw` za*i5hBmdDGiTpvG`aL$qx7Bi*`so>;SGG~_@$ka{X&0tv4GyiYuY_ybm6>-s{HsU^ zuIkFj*LgUC#a?V@ZAq=UrXm7X;XW0kDoQUD ze7_Z_>jJBaq5aY9Pv!QStSN_Bn9MGfpTy9zaKk|d8NA>l;&j2i6h-i8lo#~Nyts5-M#{N$al_T zUjB&`*>nIIdmRUD`aOP%2dQb%-VSvj{>{Az3&MA7bWl3}^WH2c%ZB9^0_*TQ7cYkY z_C8ku%Z(8fPo_pSBJm*4fD66>cm2RX2k-=B7foNIOzEGo#5aXS=T0t#d@$k|M{p@~iz=GxJ z^Uyq$XGm~obp_6CMFr3*UWI~hA)C|($PUV-z)B7k}0s{wWN ztX=yZQ{1;3XaUIZDoRs1M*n9v7)@jl=d}#e#y-f3TpBTjr~H*6H#PPs^dhFxmf1{d zZ)oe#0dK5$^{Ij>v>C)g4l~5nP#98sLeC+OzI~i-uyF@T)slC(qIZ=(Mh)@@e9>3< zM9J_~-c2s47Aec<&y&ku;!mv9bpNC1e5~FHEY}PMOXZ77EiGNYP4*PQZ?K81tZlC~ zo1MSj3dQrUZG<8rcbbWk;}h7Wx0zog+>d7c6uB*0eAWa;WOai!a*>JP;%J>NS%;^_ ztXdeePl5&$zOhOIfA!=Z2_0e=Jjcg?|L=V%*S{;srt za4?uD+y#NV25~e66UyH^^VaX zE;v~m!9{oNrw!4kJINRq48N+Kyc|%1ShosM&->geR)A{RyB5#NQ zrJl9eN#BkU=~|lsYMSFuMFed0=G9W8ZSb1a5nb4_e*I4^{YLqhsQwA?(!rEH5nZ1< ze+ayYeVE}e&Dr+ufY%f)=3GBa*FnvZ*(>0xLI?bcahI++E8;2k1%7zxV!%2=T%(BF zwAM_j(`1~M+l@W=m?4a#RGBi%U)!=p=2?TOfm&WovhmJWZ)x%9mCbl}yW3Roz(I>~ z)gfaJo%BR|v9d?zQY)Ie=ySJ1#p|A*{IQKuiL4(p`F`+sTm_q^*Q+7HBR&^|2{h^K z%sKLfah%zR!!VUR4(DlUd*~q*m($&LEWu_mhi0?IZh=`3V(?t>O{?SiN&h!lEAAn# za^pDTHuO>kyQ>{TzASYD2}T?FVkt2f2VwDPq+n|`ba%&xO0se1@mAn3>mnGOh>3{w zPA_Z65xm!Jt+k5?1NWC^kPn@3Z| zbaa|$hw>9v5!c6AMvlKRt+h?Gpv-aHESx}7L1 zlgGkvo2Wh*{{T@8q zkAY7Zs}Q?*k4L#q7xn8r819xh3WsoL7}rVZCJzDLkD9FNZ;@;$KwQIV?^n|ojnciH zp38NmGXW2@d=oYk{i-)FMDv&jwDLo2N{_Hu6x{4ittbl=godb}Pen(sJVIDqK8ryI zrL-7RC$@uD=NIZ!P)RwxLe7YPMe0=`td6?yU# zO>H}m4ckJoLh5EK$?c;KmTcpi{Lfu! 
zFyiY%MZio*O1fAZwI$F^E`%NrDiBj!m_S0i0C!omT$f&fXMhmVE_e-cncnUgCwr2( znyVv(4YT&D{XE^OS?zj7yoJKwnAPZ6;My{mje{8VOW^v;i9MJn0$#qsVmf}B5&Fur z!}VRh?mBCkSXd`R@B0|0s(%BY#CEtdDZC^LY8WUv(bD5 zTOXoJMc1a^qu>mqp#FCz$zC_t?(0n7e@inew7j3A+AbR?Y1VGc-v z4;MW*KUbZ-*dd;lo^~P$-Lnnk5eYuh-i4+Vt6|>Ah&W+I$(VUQ>nozxC}#yb&GvWI zEMM`{AK7<8+DFS|-O1+O=TQNc{q%Wl!oK(L$c|b%n7@vSt)>M_S zYVeOg?I&_`I1u+;dliMzo zQX=%H@8ccN*+PU{rq>xZ)bkD8qP0ZB_`()1c)rKYrmY=L@m5*s zk3LiDE{epI{7Bh9Q3l^1r2r<<`Af&ZpCSKX(OX(({Unk$N+;en7|$=c!;tlx?p?Rf zGcp8sjh#*BGaB*X&JPCPHQHq4c~XB!ZdV9+PqWqek%RbgCJ?iXjzLZqo?XqcaCnr7 z%vWQ)ixJoen1lcWyz*?h6a!v|%0q2d0%CSa75S;bC8ToXapYdoG*%=6`S$yUt8 zv|+>2(3E=DXcSi+S=__OD8NL#GQstJ#4j~pbw#8aX69_lKma8^-E{=!f6{EFQB`qDO-sw-ba&AUv#fmufPg(gf65-dj7 zlBhtTFc<}40N-)=#E@>KovcegX#SZKk@FiPG+*Uu?hQ0EFo*loaNMV~J@1d+w`8p9 zi`(-a`N4bdXS$D8TPuuS^4&_h9@SKBE|&{b%hZi@$+X%XrLy{$w^z_DA}dncQ!ar= zy}D>#wG;!#_*<*H8FBs%c1%W3fc4YKs`Jk-NiJKbc#ks@zdr(hZ9Qoru=hJe$ND6V ze0bHp725H17Ak@;k)aEbc?!{!(+iP%id^~J6GH0_uarrI-GW&x{wNKi?-s=e2mCi=KO#|pY53rVqeYLvpOZ=R@82LD{i#8(SYznv za+vf!^!_gg?0@?8D*%S)?F(+FmS=?e2Q;)mb<$Gu`ed=m|FbIc{wtO{{v-l57L=iQ zb8c>lSVRLed)5r}OZMIB8QGOE$5D@iJqbK_*=@0nyyTG+NJZ$HoFrZJ@e--*uCnF* zPz<3VG+g)xs>H~~e{d%VK4B;C6S#|P>v7!wHv@wDKLi1AZt`$8MeYD(D^IZkEfShlQg8W%G^_@#2tedcg zn4{JhbYeX2oijC0p~Fvb*~NOc0^mcxqz?q`9YxB2)ribY%iG;Q^n~)~et06;K70fG za1ZNl1Df1Fc6veHH>RoJUwRn%D@ruBv)ngty}@kte-*K}xBsN~^<7d#9bTgl~;pZA#wHl<4%0kxjkTLJv8qW^xRu#R}hLCg7kRsq~3B(_& z+gLXG2q05*E7K0_^6kwlAKcSCs9Pg6QXz;x0H^T$aW3&WpN6ylFLh~lv8cf z*0re6D-u8@HBb{zPK#EC%4C?9?G~53Jw9@CSmM?6_Zfvn+x*4U_QuvfP>17@fI)g_ zv72$+d2abjQAxg`iKX~n<6uvQ|IMoJJUM4nHH;x*^h|jCS&ERA@*>#{TUF7!lV*YJ z_~masupEq1|8vDix+rl_$MjTY#BZqTCx)UCQ`sjqH1E= z&-WQmV1+A6OcewRwmxMTuZg|qKnHtAH!{qY)jEzm1RZZy57a?yAoEYB(9-bBQ(K zm_FNeG*0Q$pQzW#YuF&*+UYDGE8u9~@4#zPv2hSB7{R$(cz|aKn;v zqE@|1#$MEy$bP=W_bj)=y&Yurwzc~sLQ zBzJlJD4sTnX;T0HmA63zdo2*<@t6=W-sPsyDTmv=aE%0wyo0rOWQhJBuE6nmVNXJP zhxq}yVG#0-p_|^Joi4}xC1j2I+SJ4rx%*}E@HgfC2?E)jZ501qqgJ_?p`q^fF3ASVv|$kp79{0;6U^r2QN6d zg}+p(EHoK4;knP`e>x%%YqT^)|dUb@Yo1r#cbJGNjvJ}`oI0u4E1sINkr^;4l~Z9zNZew0lYy=jR(&a2!Y`daRXe67pI6)}Mdq}dveX;M%9B7<)PyUs zCQOvv=&TeYe<;!gNu)hc?bJBeHrgJ;WsSkmo=3BB(d?eDvF9!FP+Mo)!%T`~n1~U( za`!q9Mt$Aw+YNG&^n!Ib+lOy~=(zoJPeR*=0if&;OmKL+7lnfT-UPRLeP-$31lsya zyBv(#ToFo zQbVv=%`jk-`AX(lwLa|ElPZiV#OT=rKUdx+x|UCY&KKXl3c9|n zQOCBgHbH-tdyWs2ccs-3qY>X74e7IItG3ntu{GE|D#iFMv!ZBxam#B5nPdaMz8=>h z^2-1YZ!&|LXwx6>4c3I<3Kp3^d%zm zj^^);%5vp5=WD$P^`#J}7j+WEV!e=(+s;`RKZFo^aKG4vwu7dcI?O;(4}m{s=JjHZ ze;oo_J^x=Iy9=M?Z}`2bBo*gc$)ouH)UTg7?+>88-~gBx%!iQAeE$mB7umzUePVpf zd&u|wEqw5=f@QX$p**NRDoTLM?y$*$BFM=BX2jJpa(lXjHlgE)CbLcV&uJM*=-f>r z)2o9~M{W_9rQ+0Z&NuHannr6FZDHp_{y&}je<7O}Ykd`K(mJ+-|>Z6W? z{E$x9%2hS1NS26=dxr9t<~5hU9SDr$v4tzCf9UI$q{K}`i;0VUj#P~btam@f0*BiV0pBTpkXk~GS}?J@nz zFOZt>0rwGh6AC|2axzNGtpn7eW&CyaCutA1rgzgoe99C}2{%T+_ukTLQs)^#$HLuF zcp^ANPxr?yS$CLX1f|Iy3I$;Tx>X>cl4g)Gfy6iLt2SUjZsbEqHYTL>$jLENs`6Pb z$2g>u30x~Wt@a?7cZ&-Z8;^A3Js(*f(jPR-ys_SIHHs#n1Cj$JCG!tqzt!!8@$_=G zlvL=U30K}`{D!aoKC#>HYF&x%i6I$`hZytji++`8!3$)~o2aV`!9%hw&-$e$I+`)U zgvyhDe%?!jz2U5AU%sL0%;z45G-%7}0~N*IC;e~o{xZ>*D@25x^0@f(GkNFjdIiw4{G0f6h zlHEyul=;r8wi!A}+=s)B>z9(p`;fgAkk|>+HpjhCA@ML==QF(|JwC}+o*HKz2OOor zAe;CJdK^;Ab%;P3P;eLY=YxF@njN>;TFH4zpmyS6XMf(4aRddyh>XuPxOLlw5_$i+ zbK8ruoN8iM+{<|XkQzwmFD){uCxTYH*=Y_D_%DFpde2M$8V92MTqKJr zICcO%E=b#?w$A=9r%)S98OhNfc@6rvmEv0Ty5-bocB=sY`lr!!hC-w3?_XzhtK9Ls zXE%Pjy>lmDBh_6Fb3LZp>>2TuIqJy|kBODqmrl9uXKB0ldALGqncnP=QRltd0ig)B zGIr)W+L}aCSw#MHPnoN$wD@pi^Bo+NpWRscEoZpmy8pw&{zT8bJ3Xw%Zi>y(z&lAZDA=7a67Z=GX-!8$4*S%pH@-)Bcj#tK+yS$A!r4I;! 
zvj2xu|N9U#jrOq<@kVM{2&Ow;S%<@=n-ZnvQcK;*`%W-kO6JO$-BU_@qoYUtXm}5+ zF?}|3oC!_h=Iml^ijYzgfAWst>0SIN7eD)Q!INaRa*uWJH`t7+*Oy$NQ$DZjU)O32 zl%|-!wblH9BEvV(KdE8XDL1AU&37nFa5scPYkSzHAII*QN`7-N8H>y#wLMpWH=cGc zMKY3{Ns9uJll~+}nd`c~DdiJ-x`&I!neBJKiR>NGEn??qVpg(~` zQV9!)!UXb>S zti!?f^!=Z)w;A+l)X=~C6WXn8Ymr`Aca45Zl`AGWGC){&- z;tJd^j>`W2@q6@=3>$>GSK6qkML5ZdJ+=;)++YFyY~%+1yrdnbrvw z{T_H+##HUcn~3~Xr||19*4Ja_993YdYUu8A24LA`+v%%_`HW=*OVl^YthOTgANNc6 zuW^(V&Nq8<^VRy=g$ydWLSmr)5VSr;$9FQ@Knd97_`-Dr?I#IniunIuTfE!I@{j== zXW6lF6;wYVzIGin2$SGSLDAg+o{TrgGEqUVz!A_O59+ekx9)frIz`{xrv^ zga69~xMbj1k7EIFX5WE@SYHH7sy07Tv8(C}l+bmwpl0rm_-%4JAPRgctMJ%BX(6Yf zpY_GbCGT48t*ixbT1}VTzDVrr(y_jh`|t+Wad4k0!2U~79sDLWM(%Su3qH_F29c*N zbJB^Fc`H0sLMd@}7E=nS2^yB7u;HHyl3>)i8Mdd{$t_T{eZTFd!5lh`Hu$hXrvsUe zsWe+z&K*rrf|?YU+rdF;{dl zDBPd~+C82O!t%*ic`cf*J%tV>gQ)%HtbCz9GSge8gX8#@!2~Q?6zGAaXkyLq`d(n}F8P)g(>Z^>h$_}dbf>o)^ukIBmmg?Py9HDaH5*iytD&J^E>Dq3-RmmxG-^EM zdr&dXDkCoji+Df3?JB7mOhC!EU;oXj!~yMbmg&t(2ZGTUK_MX-vi4SlYAzMNoHr@yM#-V8oS;Dx(Rs- z6;=A&laa@Dv|O4_-FWbQzfWJyQUAln?50y7fiJ_?<}MX$z>kJp>)QbtuzSL>{Q7}v zAcl}O5j5Chi2C;la043TAZzv3rx>W#>A~4&Vi)rfU2gD+ z0li1Pv*}%KJjA-A6&OD6AD%R5C~77LQoOaIUR#t-19nhY zZDyC-;sc$U@_1o>yD%~Vvqh_S+NFu!j14oYEQ9maS)byu>IIuxsJ@6}8$rkJ^YB1P zA{y+-$wa2Uru6H(E^hdEa^2-_PVB7yM+Jk2gBH^)Y{X79^Dg+YNH;*7{_>*mo-g^; zPSpNgNocWnDDe*=82@W3v5 ze#NuIs3c12+x-)oJ4s7;f^j={b>Fa)fTxz68k2dkhtJs(pQ6DpDe7OCUY6I^+I-xV zA<=|3w3=ER<=^auoZkn%s3ng1y#Ly7ou)M^SxkWaP>HG3sTNwg9{UZH2Rdms$oZK> zVD~Oid|`Pg3)ZHaG@Hp#U+j<8?3BJcbo@EtNmD;%MNyu_9VVB`O!Vya+~?M+WE|AU z3y&ZGJ1c|%tWL@bDKB%a*rO84>|FW$&5dncOBWbsn$#p2>DIz3>2RsZs4~Sm9)Dpt zR(m%$6(mXIZu~In`2iO@%^~D6)VH9=K!33>5Ocb&iCudI5~iX`z~|N_YY*62KSA>S zT_>-57tlbmm0~8^k2FPawTrIQkIO9#JLEKIV=9ZUb$>% z<(d0;-smRN(QV;~_~DImHn@ywBjNS_{fjkCeR~t#hFF{iuu#gRco1r5CmoC!Pf{I( zYPEd_=FE>kb|qHKbq2jkBDA4h_-uJt6USy_&39I#q>1(#>~Dq0@wa$wdCjoK1U`u?)K?Bty_W-yka-nJY{IQ!eti< zv{B3p5@Nqr7p_H(y%OB3F@`S=A@u38>HYaAjDu~AF%>^sw4HPs8dS0s z67*hXdC(I;IQ*(Gi3c7&P=WB`V!z|cg=Pmey6PtK@MDZr7_( z6n`qW3@bMCrnLGRMJ;T}WicL1f>&}zx?Ogp^CU82tbq+kTqzPc&q&n%io)fx3G>hj z(Acy5a5hwvl(t}cnOA&Krd=MbEV$|a5d9)zcLkY6VcZ?LwGS$PCqQL394#=i<@1-s zw@Z8XEauT(Xhon^9UJw06*Z*FHPR3Dl_Z8+Fs{$){9|Vv2F?#6O_r$JTu1tB-n7Ub z!r4KERuzGH`nVrs@8Shzeo=$UPYL|V$&m12j^{|HVMM)ww7Hq-x|b^Dueru&v; zl3u{8CsheRI6Tl2K*2hs+slYIJM6pt$u*eWDSuJuN2ZqyMGgXTbXU|tM;SK z=a2)yc#*o_@XPnr8~j!v+VBUy6T(8r0rz;H)n|MSIeJxja5q}q;Hu#W#~&yBZojCY z3^}dN7Yd&`KOZRke9dQh79-a6@8PUiH%7M&i&tdJgfSJXfzu3`5SVyM)+0{ivSuCk zWttjyta$f3zm!VNgyMaNI6WX3dHBO_WV1}IJM)UNbEgH% zg8GgL2Fizx$h1$yxs^_vtdAmvKQPmysq)Xa-=6-AkO1H?8KJ@!&!#$58Ul<_)pg@K5KBo>8tB0@F$ zBaO|N3U*_JT77B>2V5onlWXLm@)^eh;!`o54fi5{V869T`MUU(;E$Q2-sZ;^VKji~ z{wFy?WwB5*`0B+Xt$5GJyM1EiOa7RcM&7{W!md5fpO(r}t6=k*A+!}vV-MoDiqDvS+-4}h_xg0vq|#_eGnSp(>xs+3zC zOg*SA#dymgra?WR3T?{!ePPs&l9!9cxqjC7GqJ4`7CXZT$MmbVx7TTlMwH8zi zVSX}_jq=4&4I&z}*Ebs@R1CpJfP#_px->=Kn9RVv;@Ohm=Oppn&KOF9<}ZcIF$b#x zW&}jqEq}U6k^D+)HwNN${iqnEIm#^C5@7{b!u%r))7H^tf-Eb}W#@!ezT60aaXZ~}E zj+nG3pa3f&5=T<3A;b@Oh;F6zJ!$i2${Jl+9nv6=*VE{dc67>c#VWxu9MatclDuiO zZ694Z*bc4E^GAwY%DvpsCiQ*Zlhh*xR!YG_`DbK5qj#UP$HDSMw;NyoQb|vv&t-aJ z<`udewBNk5BHFC@$WZp2*LC_P>g1?kg!L}-NvjvRfa+ISSv|x$8W+J%67$&bZ30!G zd)*hdaN#>6v$(Dx+%uvW+`h=yZ-;Gh)pb95Be}NHW|*DEbGRVWS-#$zZGh#>dMc}} zwPXmhu=C)_2IsDvX|TpFHN0%a?-Er~k`C&kUA1;);~$Vb5v1{GHEM+lCk++il!=t# zQ*&Lx2-MW%n_Nt_a+1hgxPGy2cGQo#ArisN??1J)a|gTa#dybAT;+Qj9f{yQ2x#IS z0L@mk6f+9T5-G6R-sl66khD^q{2rbNCw`HK`07+Co?+S~j9N+|H~vcrymm z9G#0xtZo^JpwB2H9kw^H2uS#Ay@)J%$+Vg+fzIu_Z{1JL0y9!psn4q7(V!i|PaD7s z-mSBZc>@3k(EkPQ#fZ!KXdCmF_I$PRhv_8J-*s|;1fd#x!z6XJI#W4jyh&YZ$HxJg 
z%rAipo;WOe)JtbGagVXE*&9$e?DKkXsReEwZ}{l<(h3~0*NNmxQ?3_o@hVJ3gk)Eq z>`lMcP$EW?@MBG+A|X%B6)U8%E;m}mh3bq=Z6ofn{EJm9#eW)Q3rOQ(; z=yC+C|14ETDKhVU*27wIGV1O#NNat9>%xoVmXn`Nsh6NswCtACd%0h(9A9@Fi4I5c$LZfV08KmYe{8=TgR<(U6fxKZ5xS_t{&9Ft|&DyO$cg@#rF>ownG3G!j?MYXKM z6qUY>RZ59mj=+ZwWgpGZEFO@Zt-`H8kL9sip_bY7LlZtOyguE}0)tGAvHRp{o`>4! zEB#Kn(7z?9OnT~0;J49FWhaMlHa$rYa?R6&4r@S(pHekL6Je_3>Yru9k-_HArGA!= zGwezg^8)2u-4!oJJxZY-&e75S?J*}PwJynit#yIqxjlX#a>0vcYnW0)QpdMcJ7q&Z zSnJ3)B`9O6pW4eCU*_|Ybe%BdYO*vbh#9-QdgPSFdK(PrJXTHR=!)T9d3WZfA~LXE;KsB z>;HsWw)rfUVcgk`8FwGvoz|tx!^ER^Gi-Q^O$SL^;($Vv|2woa_7h z1vp<*SgG~cnSKeu$de4sI+GoD?)S2OX=qaE7L47wQ> zEeU<%w$7b3!|)SE$GpG41~6)r{U|D;hQ$Q18iaG04e#Z9_*lpOIG}*7r!pJyV5+Vk z(tzJXEhu+}cz!rGf6CBkpG3q8&-aJ-o#4r6MARq-FvXa-Xl~sYzp}1HX!hAeQkr#* z>?{Fc&MCamZdQhYDVK~w7`%Him?!KR3WK{u8I3hgh!#SIP*F50Xe-SVX8Dk`u`q1# z84`)so;N=NN}@$MBE-d|Nv}?I#YJ;btuVg?sI`@$(y4@27O2xGfVX4#;Jji)MHS!F zP#c(7rvKs|x{;dzuNIMz0;d~LjB>E1lW_H$#Lh9Fgfa?$ST(wBNo93Y5-Mvor)W&R z2sk6_$7T;)NAVIw$bSz1V_DAi_`Dj;Acp%?fz<)ovv6<0ll@e**wk-Dc3qE*w+vWl5ME=UAtcnmv+ z#cKS_kjx*kQz=XoiO4H$cHipuulPv>@X@J86WS>gCiFg)g~Uz~efgYboKBP-=xQv0 z-SM~wtlIR26O1){Q*2Qz&apnxilo@y5}NxJw*+}<)4(&Ivd}9jQU57NMyKtR?=Gi3 z5IYS)s?9j&@9)MK%20Q^kj&^Z+EXqVaZ8m@dG#+OITAXDX3yvOUS`^OpzZXVr1u(6 zc^svoe!Z`B!)saQ-GCJvyZ463uQWN0HEZ%ULeS(>uDqqMQi_A*k3tcp_Gald+S|EX zZK%sajEM(?{@4>^)H#*n(ev$2A>7qNrgKf@{2v^TlKo4!oEQjvcDH-tREyVcD*HK5 z-l4&3`YC$CpX{*Jl_-pM?m3lNCvpYUk3u!K91~u6fanCZ6wNwZMvd|x zfUMZ<-@_8E9@lskM_OV@cS^sz{k=764N2)`$6n{+~LH_$8STAH}|4*7d+1G@xWea6m zp{Gh)e|V*w`rBnGHPs)t*35z*5pk9nYkpYS+A@-Deh=RjjN2W*W}#h#gp3JJ>&t8A z^YLb*H5>bd+2jEWY1;fK*Kkq->z^!bS2a#nlI|aWyj2=oQ|M(DIjVv}tBC`zW*3$; z!;PW7^54l4pCg_w4u-P$>UZIHW(Ieav#$o2Zk4=)uAPRp=k0uIFuvn;m7Y8V<2Z| zutrLC6h@^&bUjzV)}q-PNv0M)JHbp zN0?-5#M#bewQbs2iKNN>1{MJ~laO5wXPI+y0A1uqO{qJ`v-=ip6g)an^)8G0)a#*y5+f(K@WnqAn$sY z8+r<(w)kij#Sw2=ZQ{JgQ7~(QIkf_=B9B6X=wm^zrMZ%*KNUi2Mx2 zfm96SdAOK!<$P=E`^?b_JTtWaT&3e;xs)^5;p`?0$gDT!fg_^e}C!1&UdwhKSb~!SHlIB7u=2@~ZT1SaW#3PzUmNQ%DB7wuG5ubFz zSFU`1e*mGIY6a$Wxg#qql@=GpqCKLlF1R;z9$tmE1^sAl=B5GwaNnP9^#ETU6h3S0 zhrkpyZ(#ly-fv;r=XgrE_p$F=zrQk8E_En5rMq*C)uGk)VvfqeDy7iBO8J(yF$Cv> zEUBZ^Hu9o|c+nj9@04Tt%zFUkEe=a=37OalEdIi3cV{*#x|L_zisWVUiubN&z4D7% zt!fwhs3k@u4hxl$_w5<(VsLHWmj83k1&(|QOUPTTcfU1RL^6| zg6hX18e~i)s;{CCla37phv^VDaY`s6+EJN$kgeI6475$!^_`i8^t`GJ`z{(-< zp(75`_wgNRLXkNKShnnA@Rn{f`lSd3ErqlLCrNWctipq$IbXk#F7zG*2>&T3DMZCq zq8%+jz)?U3&8VOfM%P1^-{=r<_h{!(w()z6K7O|f;3QVpjUe8ny~K3R*bV%kLe4Hh z;_o&%3E~28mXS1V_hY;(qDDL%V>J9;ID84~4T7A(HE#_%z3_t!0nWRdX3=MV$P!`! zs)hhuo3v2o z&#Qv}#F3s0U|^Ig@q3BpR|VIZzo=cOa@nrS>L_x`^ofTguVR0@B|2jv&0r+GwYxM8 zGgVZ&AE6v5F7X=<=<+vHHgA&sq3$u`2(i=G5mXM0WfeAFL`_6*396MaN(|`1e+B35 z$Bd|iY;lI(lD<{oqHN}7%Z%%VK|FP)L&#Nn&B*9vh+9&ym1&^{2h8JSfMpMH5B1*& z#?^Xb#19939}w?A>8x|nd!VjXN!{8KCa1Tt6Z3AV9C4kD4pu6Pa9uvn;bxs9Y#u`YM^3dB}lo9#&+X zGB2*wl^KlzK7bNa57UgFF$hn7|5A7UCE>*+Q%Md zLa)`39tixiDzz-b4x|<*FZZsFg5|h9Q`@P?`;l+?N}Bo<+<$A=Ds>^1)&uw_Hiju~|3j%C9fFeh{)t-oZuc8)Wg}IAZ_*^pMyR!U zJz7^>nH>>9NTcNYyU@`^K=d2=VwbgHMO>9cT20P8G&i1bbG}GoBp1%E+Z0EM;Kmoc zXF?J3P5<8EqZmbwgdN#}1G+9ha|P|B8tct+{>HRXXPg$S#O?X?^0K9--2RuRmvisI z_amm4jJh|B<#cKWqa#14jV7NbXMb9}Qyo+K-t%ZTZM*|h04g~@KY5fq$Vy>_g}}gz zZhDn$Sk@t`u*{vL*c}f`Pu||3UwjgT&f|Q;#pi?RSCo3AZgVecN!p+G5e=N)zJ3o^ z^gQt2`h&sdAS{~MK{^tIsL~v|KYkVQ^i*0rU|otJh=6u* z#RuX-u-3Axy{{B&WqG}K%XR2a!WOG-c~^NkYfL8Wi=;k7ufeEI1TKk|Xj8+oxFvV< z_wDrma8ZMQ!5cD>95y=)kHeVaKjTsPYnZ~-1~JPCsT%950W9HC5<2)N*^dqQ-A$06 zpPF~~b}u5yx#_Fe&6@cyCsdM$4^`*Yp9`qV2P~Yn?r#uuaJtelo6F+Gvx1^nB60C? 
zR*N20zJuS37;A1)Z#BzVOnZmokC(}=JuGF$q>DJbue<;W6F$HHi~YDEHtx)-o)Y`G z{URY5GTooWd)#BsP+urMgxu-_;#6-f6fY8Ms4U?r|Lxm%f@|k;e+jS^rlL3M`yJ3p_YX))bqx&1E}z{+IOr+C~s4_InCG^fLy-XUuM ztf;N7jA7J#4O1Zp){7YRd}oQ3S}O5u`+wN_%BU#Ywp|6Jr4f*l?nXpFLSpC`x~03M za{%e??(S}o?(Poh?(Tj2Jny@|wb%as&aYYby5hKwIx|f!DHpWXjvnBw(tqD=+&$yo z5puSo;yWNN7>dW!{<5#G%Tho{(~>WJuiQm@yw;YEzTVl{$$tx_^|vGBpBj*~zuv_H zp+AZjbbD(18$C-nC1ECCvBG|&;KOIO@f3B*Lb|nD1vOZ35-odtqQ(~a$~>7_6!q{mc0q!i zmG+yYYkT*E5b!J24kIBZ@WjXB6cfM;GgrfJj9Z`}i8Rb{qFfPCr3Y(ot!d?br$xN5 zI7BmxyHRfK2g;&CZ~)6A4%mlGvDYHFJ`iF#S5ByoLy9Fn8y>Syea(=|7aK8W>~6Ap z;SL5(UPi29`YodP>TnBA#2afjxNWfN8ubR*sFW&wmRPeIZ}fg= zt0MC0b-AeQSRka*f`!hBiAYfXR>lATi3i%96 zkzk6;7%D4pq^amu+ZsLpS zT?bRSuT-u<$i`qKPQyyA{37lOousXt$J2ElN$sLP#upIsebCx*;~kKj2Zun|Mx!9C zogwmAj0EVd6^?k5@5nSj{nTxQD?ekrl9MJrK@Ksl9lZM6m#TXU(rQ(fK+ zNcBGy_h^;;v3~bHhJmh|W@2fPqN*0oE&3|k&EArAqss+}@;{8cqxd55HSf)8ZcCSL zdJnx+agKKI{)Ym!<|44$hIEfk-Jy9Txrn%aDy$twZ2Q)ErgQCMX(~9E)&4nI)P2a1 zUhigtCuNuO7kH{@Nh|gus_5o4YE`H5RF`0pyaiiKWs1jax-`?yo!9-&$3mhm&`fa9 z7!#T@OL+GK>8Q__TtW&1&OlY=0wbH_&>IQ%!(~asr!IArU>LcsI==*VTMY4dI)-1Uh1qfsYWEFdR?o9zs!Vkx zP`kt*Ak@Rj{nqmSNLFi6RjE<|fJ0-JIHaM5dyX!l`>+2@PM5WA@i<~jk^4|^J_MHoY1WlgG)0&FV{0ZUg6dECC zkHJ|N#?r^cgEyx~8^qOAu!WcVhmAWtZU&#*;=RK^9eb*BJKT)A+fqqaZTTG5V;z zTGRU<4P21)Lqi5s!6U=<=BHPWB5#^#pZFN3^?wo_`y~7@4CMnt<4q;9rL7+BH70W( zrIPVkz-m3>*s3B(_Cr&U^v;z9%zPO{)2GXL;p*0{p*1*^%9#8M3VkIskK=i*^;02S z+uHQum@${z<9EuaMlsZ?-%Ef}_mA;|o+OZVFK~fj6mlQD(Nci*PI$d`>?Og<5zDll zXdJxv*sCi_Tj2|5gSHc;`TR6m>)V((rJHx){YJUp!oVQ(X<1n4d#-46D&4yR9cs0h z$!^*Y_m$N_flY^_hkwf#7eGHY%dY1g?Yg7uiiOfuMk<4NO(oo5+Z*0c(s6RYO-PkLVENaO0=yLt@|s z=@+e3!Bw>#0I+acc+yG8phC=2L$%B&nQg@+LBt=Kb(v1qhi@eW#eA{>ekb3Iw@{@N zMV+B2_?rC?LA+2U$5?GpiF6-E3K1$Yr$D>enwy;?&b?X21r7W`{fil4K!Z@O=9&ZU z=2ySk-0wc5`;F-5%*84ay}6aMLM1xH6#98ZO^R&IcBXS}ZcnVdhx&rhH%X?)pAO6K?}LsxT9q;FZymZQEtIO z?`fm`i`-ZTnw9jbb6eAsNfrTp&I?DeBn5-R68f?jDbd9*ugTV@+Xw zjl9()?V7Pt(>^+JA9SU^P$kCzQFPZu8Hnc{staWx)gl1pa9r~rbBd@nb9MzFR8Uzy z%XuZX7bC)}!PZXI8KRuyKR>~$p!836$<#vsQXa;HA>wuIpEQsVFHxkI43WC%j$6iM zr(hw%fd$~GS$L&L&kpc<#Gii^)yrvlee)P$kZ>Ds-elI9p{!0e>_{B`s;J;@^%7aJ zVf$|?X+gl#qei>_6edi&aj~vFpi;tbdG=$Y4 zh7UCw`Q>~QlvgCO&r>+xJg;*ny6`*UMElZ^sl7$@Qm#rLAup~@-;4i<(1I5cvy`&G zWm+7v*0%d5#au%rbwH~Oms#cJe!BisSL#>$5ArK{>l0ldbAqa&-U)G$9x2TTu>nRN zY78@FMEwt)Hc77v?DAR6@qjNc4SOv48}a2Xy}rBs{wkh)p46x{Lz!0Cuj*2|WSQK- zSTu~!n6RP9)2UG9f6J7&NTFoXzWG`CoK=0rW=h`QTdV^yseZ!oCB+?WbBbIo-vyCsrl}FPRS%sB5L$5(Dr77<5`KEj+ z-1ye`#)y<>BB2OgX3~O*s2RSE1uC&9PJm(FXYmi@^VHU>*o0(OlgPdj)tyvCVlZp@ zr=JrTZ4`Iuqpz~%V&7^`=d{ep`09^-FyJLmx`;=w&7YpEg&5IFYQ9m&=*V7~{M0yYMzwx#M}duJi6n(;Xz#O6YbYFyJ6_-@%4_^) zq&wtOZkEaAT!(Agk@~RTbq&kVwM>nPM))IPICtymnS1;xv>Wl=&SU#=kDQYOm~$@H zVgm1>`r@{6l)IC#83oA%<#2A&f5y1Egc!qF0tj#Z@fY%}LPx`6bXcHG;kIQIL=I?} zt&{*IgnbMH+$jZat95!q(vmhGJeogYe*wr7JnZ_MFJfmmXFaQ$kK-){YzgXNg2q2M zH_EzMvf}f@xy54{DHa7`ngjk&i&T(-mG4r}_V9ClZwX`P_K{<=gOnd7X}Tzu#oHBV zYFkiJAqGCxLD+Qq0l{wX26?O8Y>=xH5!X2uhTea3p_4I?*5|}={h|@x)^Wh3R{2IM zCuTkEn>#FOeBs^hotVQ4(>UBW$WQ{g;J(81Z|Dk(YU}a@eWO5@oov=`#H;_nU1ylM zGW7s?92lE`olrH>Wn0Tp)Ird{e3P^1-^KrS&rLMUGmp&r&){3Ti+%E>ul838xO->y zPVDY09^{V!vRI}#w1NOWNL8f)yeDZh-wy_rR@8O8Phn!HQz_kLXhM+`h#LgCk|^u!Lsr#I_$jZUQ2UayW%baMZeFEA zp~7mL>zJs;&9w(~jg7+M}mAr^TUz}e*MXeBH zzR0S8OdbPDcG-slb3J2YQc{}jB+>9jx%$;;@T9B$uI&X<(MQ-GA6nLwKIU~2BbOHd zAYG`WuR=b0VM43&3H>Aor?OYFf1}Er*W}WpDb6x~nY@(s4R?E;w3zOZU;ypJvcr1O z1u8RJoU>1)Jrg}nOz?~PCle1HvcXx;kRJ0C8hybwIKYQL`XUCL+IL28--dzWp7oB? 
zHWR8Xab9?Bv9D{3vy_+eD)3hEKYJAQKZo|=gfq3aoju2{X;-;HWdR z?{|(siH5*a#qnm)M8!EV_*NmA1*M=kIj<>|c&y4z8bI^>qa^&|7BNbfTY>VN4X0Kq zJRTV3TECUPeu`|Sw%^;c$<`OOjDr1Aw&EEbN_Vo_80_L=kB+v1YN2US$p_B0t+VYK zIsBq3K6M_qiMEH+-EU)mFeC7zfX~i!Kj50?`T&axHO@s5Pc@9z&>#7hSi*=abtF~1 ztM@&NMH7{s*`eV_yUw4xb(#q$v2ms|i#iy?BFR~@ncu}9hXR1gP$WMHV>?e_KVp5;NCdM_8$W05tIipLXbGn|< zA4V%o&?*;~%#yml+$D~1?<+=iF0+~Wb3yM20m!nv>%bbYGiB#PH&>*P0a|JZ_&^-q zg3(rEyd~O2`-Iwl+uH=m+IWh@kS=l(R+@milk(J`m(D$7PJu~NWyZpG|Bhm~CS?F@ z!(lm)ZdlYySdbkKO86x_5ga?qXLCfX5>knEm>l*ouY!FExEOuD z(n5>jjgAN~7;uUkedJ(+GSi)_s*4$&ruvaIX_t3!2c4@%f&E3SNKp}2>^2r~GdScR zJElH6+Z=tzGicCUa_pM%U~Lu=>V9~*mMc+Vu^;XT#$iH(C^{83NG;7K5N!olZ4VaB z#3}#95kKL)0(wXq9O%N{&;HhAa#DNT8Epq3#b3%H0`Bh=6C%%1|1Ef9Rw%mgvhIrh z8YU<7S@Z&MC89ie)|XJgT?8j}_6rmkj;4!G8ntfjZ9ZNnvqXQtTl}g;vdlsd12p5j z#hoS0)&ON%Pju=+Ky^eq_5~5TQ+os1aycsK=UnJh)TP_^WKfxmWUgC>Mn*CfX(pjJ z?n~8oZa;4kz8`%Tn&Au=CJGUaO>J<0-uO&*BG&IcxoJ?VO&TxU1aD##_-FgWG3*L# z(?XSetTu1Mvu7c^ctCUxO5U2JcIorc=3}@Dk2?!_k(Z!_pRQ-mn+a)WzAMfB;0KZG zsZ{@_8=lwOuUQ&d3EJQyg?wtNy0|E3kEgT9LL`jLV8lZ77mkN7iOvQ+ykf7X@~dpy z)hopg)h}YX77^Q@DqjVYQCZqDI{U5JO6Bt0@Dk7G##F-<*NYW;Ls?%{xy?!oK2nF* z=kTKAuShMcwh8%FqgI2k340!6+)@V0Wt5Jh|6n>&qF~r_iS#4Y3f8L4bl*#iWiO3e zn!5*?U`x!MR_TRP9u29kpnAkOfS8Dd=NFl6n!}L-e`(EJ4OA~Qe%Z2L`oCi5c?~?) zYh%z^aoyIO`wLUwuSe)`xK`9Nch7%72Fk71t2A*>={u8MuCC8E{DpmI;2a>QW<)Z{_UZRFi+`VEeA}n`IN!H8}|N4hg3_ewryD?Hlog+~s_&@6c|h zNpM~SrJGbuWqbN&Uj}~%RLP6?o^S#ouYa!S`%pv?!LD+|UGS#GT8p6M$Y8&98Ol+k zj7!5*OB_R%U0x@f9Bu2{)xDYe{7$!N)i$3cgoFc*_wkHchVHe8;~(VRd1Nbp_ShYZ zN&*i*Hdd_j<<`%kd-178NFm5yrf>{~;-AZ&&s{^wEFBjmI1aj}!o6{RFIPB?xUtme zb-j2y)Uq24A9eFs4FQ|o8ipMm;ESfZ71udzU@p+!LW7_+enVna$Hy9oMXqo=+DhE| zP*E=qNnUg|1Ywq7hJX&$*I8Z(RA-`P7tbF{Kb9LE{O_NwAJ^$AaR{O`OO`hsLQvWs<2_yMmSiSp~@h?h^ZFUickNLj->0p^y-%u+qc>qKq};8 zW=-)y6WJ;ez=R8QFnniaf1K%Wr5SV&(@Hy-Gyr>*;R@x-PLdK8|IE z%khCVvZGL_%V+(aSnNk~P7l$0&z_f`w)F($H!}L12m&$v)^!4P)S9g|!*^yz<=6w( zf1&j&TA<|jjOadfFyD&?Pabk=r!@&_l3)XL(Sd^E8Sy=5bhv-XJXHtPizH88g6IO; zj%QJAQP5?48eDSok}=i{ZBp5qK_AhJEa;N>VhL!hU2VHfSmS!x2}tZRGBi4zyQ{~3^03A3#UHrSyBNpKpofk;|7wOTe`rg4} ziSXGq&z)&&O_2T7Ra7Z`>C7QTC==oQ`JJ})&X($(^fh#0qMXUh4$d8{OEx%HoOR4= zG`;c7kc|@Hnr3La+3OV5$sLUOK6ywkd3n~6mg=fwtRH~a;ZXoCMJ(%E^UMxqJE2?^ zPWSk|zOx?`l`H;d`!YUc%i-#b1-IS|x{aSWrdU>b-5Divost`Y$rOcv>y0!X&|&_S zIG61@l`C~Y)Bs+-soF~kfuS}{Uc7G`GM3iWH-!p!ONiC#s9XxAoyR;!-?dvTxjHi0xpF34Jj`+tp>HkbE|}hKLuM{ z#f{X`7lQ+HilG?&ZWgd^G*y>@2%hD{l`tvZXm_tqu2^h|KhNrvvXrs}p0uW`rB+iQ z-Uqg`jeVrLFb*~g*)s4~j+gVlSq1UCiQFmT)U*8EbZV`v24oM_N*OgEC5!xJ{%6Jf z>ss>bhXHGmuIz`|dZ_{Ek3l;SP*0vg+<%O(3B7YJx)mr+04{#+v z;4g^*a~j74zxxe|;xTgHSQ~JTD3z_a*|EhPVFi?KOT>Y`>q8C3^|zUPC+hM0Bb{KK zMz2wYg`7VoKvEiYe@R0M;ToP(#Pp!p^5}n#cqB2(K6C#xcjMLP%CUK(wyvSvAnA;2 zA<3T!1~7BIsTne9JfgS+MU8l8pABqHQ2bqyF}0Uw=ZEcd-{XW$k_w|nK5NLG!#TgW z0me_IfhYVRZj2`a@x(|Yqk3U6}h zP`R9?X93e6l}H2{+8xH{A=}SEN~6^puP*M!>E;TL-hwJ9$mZg(Fa=pzWBbKvXA7@E zBA4xWyb+qGB|ihu%M-)mV;pJVk&yci$_tecFmFwwC$eQwKe*uaq?@v)R6xWwr4B4# zIy|V=>*&|k?p{VuU)S!RE!HAeRNC}DbI-*xD#9KbS_l_nwkR|bIw8W9*# z`F90>0*AAg@<4fpS#B_nl{WLG$Spx=tHp!i9VI$sC&_F~gYDT*#p6Y4OJD;UMinm5 zRNJA$VV}0L@M3CuTA?IN<2`AaXu#*33gMg&&cGPka=Crp8tz_6yzC5Q98+Rij9%6% zf2`u1!(^@mZ+M-BP^xG>Vzg;gG$^2H@?9;F39PP9dz9v;F)XNa_7$BOS8*MIp_GgBxIbKmJ zwTX(3Vllg*Cs=Y0sr_8}x#LgZuO8>2$IMq+$lhlu?ZEJUX8{hA)}h4$*`jEobZ~Cd zsDtq4Z0I>aX3-<5_>~U%U=qq=u-<=j>SOd3Qv^=F@apaQVoB zNGO443kR^cOA~*?(sjYGrc^aV-Vz7(&5x^;glm;!S;^s@nDZ?AHRbl0`;ZmwE65ii zQxO_n99BCI80I~Ja-IEmLdHQ>?~6C*@^8{Q?Sl7dZ$m%JnF*2paEv~|-Q=JYG-w#! 
z$PG(f2$Wxat#53jf{6CsWt@wOi|M{#n14WiT)jzh!g`|QaF=%RwQW1xm@-@EQMElY zR@h&>O}-l8{+H7JPqj-#JiMEyjJZ3Rc6*0%hWS;EtnOeFEF7jdCT zWj)$Pj}up)_^w_`^Cf`_6E}V*Yw|dzFtOsPdK`J_>dFc|NVP*nr^@9-iRMB6SO8*r zsP9DZd~IW}VYdC!Y>9gWggM{`hpW!Cau2(3;Jih0yn=Sc-I3xZOXpCNdN*o-AgkP9b9 z%~}=1w52+k@JdAOL?3s9Nm1e?F_u7MtLc^bjHZK~c`ss%1W|z@BJalt z!W9SxI~rE;G`{-K#AvpC;*lirEwz#F%jEpl5uiW;Ibbj8O44_*JXnP2-Xg$ABBB*H3pHja)+-%w;2cJh&ej<$Rg7C@ zPPRs^^+z=cZDf8VD4Zc%`y6E%A^u>vB^EKAe*Ht)Z< zxqHAz7n1bPJ2MhlxvVDSy%1|Gbr(%AzVk1>md7{Xjpzt+F}$$py)7hke1m>#p&v7) zd9pIAj@xs))2-~^Mx`14>jn6(ehhTQqaAqCE-Fr+A^!lT2+@bqV~gY@oBRSBOs(b& z9W588m(<7&sc6y2psO^Q(Qptg^qU*Y{(f5qqd6e&`qSm?e{`c+{%jL;lP$f0KNzf^ z|Mdfj_Ae7R@By#U$G2;!_Z~pg)i2k>_>Bq1b&dSor8j=Xm$!|Uvlqf8B6 zXEo>dF7}gO0iNt0rjd1GcEVUlQOv%R^VJ!; z#xd3AQzKG|#g9E?yVDt|jCb z3yU5Yen}`jS0*p>O(ysfeFr#u!Q!!Ww6)4E^y1M6M>_$!J^Rj8M;;MTQ6aK<8rA zg%Sw$=|#^u(zcZ-g&pq}U;|tEX>EnVa-3x|?X|EQ5sQrwF#ZTNkV)nQN)*BJv_5hg zs*L`K-?qtY8+cp~QDmFvnWRg!u)Xcogi(99v0CaXDG8MJa1p+QF|C79#_vm z9n2mZy;5BrUGu0zhpkypn!xi0W=;##61@}8AciH|vbni*Wb(uIMxvC%%ykXaQK^_k zcITE0r|t8k4`_TYJUKibp1~(3yG>lYl?Bt^UegXwctpFIm-vL>aJlt))Nm(<%oZX7 zb|-rM;>CY@Js8_-(t2)$t6 z?>FZsgva;bM7E@ur{E0)_d61;o+R)Yy{$7}9z1~agU!lwu&-RXHv97D$94dBpyT<{ zse3|aa(*$@81RDR@|4Apj@PQQhHG| z!cVnd#X}dXWn?Im@{~-W3L6^T>ClPF(i-dpoGbz29?r?7Ia`;46t}32PhwN(nO?IN zMnvkpuqzR-8?v&iata?EPc@;H2}-cord)+=rd{7XFrTi|yVK_SCxOR9Bdc1iNe|Z3 zq^Nfgr(y5O@J7#}ZA&8r;juG1Kb-lqT~z~)>w+X`uI}56 zSEwYfeqVa)|7ddr*~sWgUL_X9 zkjG-`H>k&3b@VL*^Q#Mxt8ih!owVIV^wdniZ7L)wip;@?wJJiT$lS*a!c$}>MU=wl zexHK@D*P;FGg43*ImnBEM7qOpfinr!AKmja2pOaYj3!luww1AsDOQkXSbGIRSOGUd zy?m~(=!bMXsPq4;bcp^oeLGJs)Ty&G4 zyMw9b-vTS_-R5j=5(jH2rpCJyR#z3Yewv;fM6Bl=vOnaa9|<1Gt>+`sT6}ZdA|Nw} zIt34yur4>pI7*7ytUb8noxGhC8Cog~5sJswRaaUpz@}~ir;(sFF=D}uA&IA^;yd(e z-Trnql(0m0vn|3)J4PFgHn@79x!XP#-RQ+l{Oh0Z{I80g0{S!SE&Js7{r{79tc)&a zil^Sx=@AIKv~|iH(j#en)Qr-Of80;q?0VjW3kx?IORKa?rV2wCJN}5z(YaM<$!t8& zpJ7>uPCPI$0Dpzl`S2NnuvXga;C*Yy6vL=4lIec2RDYJdGuCeuWDN|@5inR84@;?L?jhUF=J<{iQ?0SHvw0FXUaH zjh^7Zn3`!O?F!o^X(!jaAVf3DJVBjP^LQnh(0hrSx!1I3I4c=qf_b{-;3B1BZ@i9| zVatI-q=S<-M+}VhyI6KncLeavjUJoH-*@lFlbxOx-@W{zt57&nS;FGPRHkW(a zqf&yiGp`J_-L0KBQ*vE$8THZ_b#Y}+T5`W}uxc?*%$_Cs&n|UFC(FCaq zsdJ&rXr|8pNQJ0{0RMyK4~u|3{dEg3ls0>QFYh1|sr~i7Oa_1EA-y=!4| zstsK_^&$Tl&(y0d3uB>)^9M^^A3W+p;k1PY4jF++dqr&~n4kbf$L8`vYWOx4mGo6i zoM}o6_n1|UWP|~IDJx%C@73?Q#MHg(q2i+6+u@sloJ{NZqPa#5`jEFz408=f^b=*K zJF#kyq%$9~24O-aYf8##(bUd-nci9bl~J7?{_^`<%T>YU1x^v~>}%2RA93;kcORH{ zoYcuOEeAO4W(^M#3bWcU{e9>KKCTbL(|oEOwK!zSo(xRHQUC1N+l+ zUJ^M-MRQqSTn&)Wzvga&@n(h0$;>HWc!CPo`kul_g@dWDL%HWpnK6!Qk>=*@!BI6= zwff?rlLm^@v?cJESRZ4L@dmlZ#PM@QF8EAgpyDwYbuPSX>KSick3gVbVIOtWrE$+6 zhTm$`iZ*}QJ^=0mOqEMsz!&iLA3_`O&}qVGX*c^cwEr)8L(+bfO%1v7xK+m@ zi5`2e21c~GcVKNMdZP`4$Jx|`of8I(PCzyK(J5qWx4$av$CEca3L)kbP`EQUSD|Xb zk{mRZcmFm~EQ1=$Wh{Zi)Hl2@(8-p}QEL5^ka55_Fp9Cs(8~gwM$Sl`{as%L>%%ZM ztD~rRu(sL{kpMoAm%^$+n>x!?SY^2NRpxo0e(O=z%dhqq?0KBc!QB;>-5b@5pDFeDdtCl+q2Ri_=^JR@14v-fy?D@l9v(w>g5*v;03}lh)G`Dc*yMK)MUj-vZ8atDFC`u zi$=`gQ6?16$*t--cIw1h<6CHbb;iVPA>!WsGOI(wpHf))5jFxRHIm%h}vC!LOV z_^~$^KY_)>PxM9)hV2pEc)BUFfamu9(kRX{y5ATBoXM(-N)tVDo6JNTInf}5TT~<; z8+j%kLz{{5#eVs2yE;DLXLyvHW3~e<12XQtWO*SGt#aXTlgs^@#Bz%rrhh$gZ{c&k zeiK!0y;y6sLeb!W9vHBB#(7UQG*^n6lSi#YKgv|F>gI~3WD6)n9{mhP2IWYR+5?1f zQ^I1R?pb#S-bOm6t+GEdXkUnInUsIuun=v3dU_Sh!nF>2z)*;X2Xe)uBvJ;G`-W1N zIC8n5fE2K`tyZs8`k%PrzVHUI<9k&}w(#UR_Vt~`aMUg^9S9jvM z2)Ve_XB2o9M2${*csGY}s&`vi&1rps!KfwEp6c(tVdYzutD~Zg)5cyFZdn>SUS;EM zj7^uc>olQC#=Fk@;iVQIPDZP{@os52?}2M(d^m$KZz)75^i?j2%yY-_&(=HgHrMbq zuAFk?oYlTKED0nu zA~3xspdNUk?b>`qXH$JAoO(B4Wjji%)S}r7-woHeX`k}H#T##NvN>?M=TjmD$c3@* 
zJ+*Pr{@hVh8Ck#H**tHqG~WX1LwGJ-EMt!OY^5z+jfqjnqP=ORK8-*n1I<1|ndWc5 z->x?-J#|1VD{#aQg&4{ehdX9`GK!>7>~U^^ZC8iIAxpG1S|d0CKTJB6n>u1CN6aFs ziA*H9V|D;mI2cQ=@l~uIU!N6?5vdLVTyH)D#vO=U-|EY)#OL7fto@DcVKNhyE17%! zkdhSCKxdPH`4tC3%t>B`9r*zc!^%1DM)VnHi4nR!qc@fWP%n0D$%Mnc4eC!Kv zhMZl3ZpO4Tk)Y5ahqQ+U|Hd+77lQBMMnHc$e!&$el_u@r4*~%eV`=~d=4T>xjcmYg z!Tv|()7g-JO@+5ULv&6MGo^N-92ln7)GSEP|dHB^0M$^}{-(&b-(L5zfWl@c>< zsFqv-pIp~VJJTq?>6}=Gk@f@!Tt{nBl8iH=8KENZrF`|{%o1!vG3}9CZsg1hpdiT+ z+sQJ@=?gGCUHjd4TwE`_O?#7(A?B_kS3^9|u^-6QUYI|eiao9VEAQ6b&z z3+D68;m-$CjO{2hhYHFSuE#00Dj#bU-tOcbCwJ0JBNzILMHgS$N7v^%}^EJxKkB z^wghiZdiZNdcSAn_tGuekJLO<+y+WC*b(H(p3|XSCKdEbD51D0EYeWt;cI>%^@;a@ zci*)yn9s7LS2x*+>2)-7sVu(B3p^zM()=B*(8(wPERku%1Z%D-g|y-^b3jCFwOuMe z1`Q#tw5d*a(1xwVr+y~j2n58Som$L@zCKD0nwW_%u$~G$#|8^x0}!w$g(uTkM%@X>4sso}bAeO~ zS=*TLfyeMMn1_k5#Tt!Gc?|2B=AX3WHEVULyRv=yv07j*R=>i-VR~TY z6Gpm?nr&DkpH89sCI(d-maid_b~PfdIrgF+1ft>C8e@~?YK+vWC%PZ>js;o3ZS6?H zwCg%o@wtU_?%MH~iod=-+oFBR2Am~+W*Zr~(LhkeHSEYo*(nJzt-cFcp-{s&{h)Pd zMOHp5U@7ui-s+{1d%Svt0x4d1qNN9s{BteJ^)Gt!>W$ zWHJqJrdS)NBT}G1Q9@aVulWOn&>@<@n(92vSX{vlhRZH*x2fpa5s9X2p}CUdmlSLV z(2t9Jd-CiGa}TFu5R$P$*(g1hdP=b>@5=YAJ@eWz(L+>^iJ`;$vn2pOH_2kE?z5#v z+d$Q?fdoZWYapVE=2 zDt>Ge*f0hC8{C{;ad7_jp5GIOfmXRJq)4$?7npsH%VCaNQyXifpt(^bR55a4Wo?`?^fGaI1w(e!`@@`Z`=763CBa($ zhdAZMGb#PzluDG$CpsmprMYe(A$kp>rj*cFEt0G)*`dt6&n5Aa;u?VA#6wiS7zfx5`lXjt3N~wOWxke+U1ocN86{Oj@4fYwh910FvE$P+k z&6FsTewB$WW#F#$Nle8flPjXV_$vd8o#G8H8svs929MG0&_aFC`Oxrqs>zYJX%&|6 zmFF04TH`Z$JZv*__Zu37PWL?V@V5mloX0#g&It@GzK{<@aevvrstq6$>Lw&KOnEn= zo*N?@7rFGh+W20+g)|GS)H^o%p!wSw?ib+`y5nOSm@Uu~>qs|jE9REj_B|GYs*oT4 zY_ne3Qu5NBai`OmD4sOpW5^Rt_F=AsX@`3Yf;k94=e~$0c29lc=v~8b*FpusUxOqH zX-qpG3NRs(D>5T@kyKrBPhwc)ZSnUPk7Bc@R0nsM*B%2aP&BB%wwj9apy`^?wOuax zW-rkOrh+&Jt2HEO*Y`TFO!w&O8@5l?)?w>T@bK15NV`nB6HgTL_1v4g z=}~v4^9>~CgeB(^hjG*@5n3XW#HTXp^61zfeeZ3jV)mM@j}TAyr(~P#XncV?uf-?% z7k(H4*PtJH)xz)@zd|`mIUt-;%uSP1Vy@e_%^I8IqErV5xkpBZjoz>fpVOLu{)f2Cmf78 zHWIs@AA$_BxcBJ1oeJ4V)*f4a?q#!MCHjWWnu2`D4Haitx3DVV1lJhcUT3j1o-ae3 zC=ePN%3?ZA{RDvUMl)pMP_nbszd(9zztg7rE7XGUCw+Bzf;c3a5^=NGUW{FeaoV!j z%uY0A&-EbZqzqiqAdMHLmj*&Ao}7__hzI3k*xF2aZU5Xgc5{P(s@OqVMLRGDP%K7P zg^x`N3UwrA%0|B3@#Xj9w%tQ67VQ3H52VC4bMx}5u7as9I2lhf^YVN& zz-F%A`X1y!gf1qfpS;G(w}-!P1jw*1%QESFBrJ^R@zPqluPe>YGKM^ApkF(%%%(=& zM{!Df@{W-2wAPmgQgN-9%M-Q%d^lsXp)r@DBx4Q9EJ=>$qhR)&-O~#X+!hWMzzqfF zjlQ!SU(7&`T{qS$*J{xfH_w0N9Wl?1srcgngF1H$E;H;me}FnyNhVP?p$Bo@cv%! 
zhaFzyNH2B2kJ^qzOZ2Rw7&Np?ey}X(3wqy?$_xl%4f-bRiH$J62?rk_`N?f$FUM=o zlm@l$4jpw{lznj7Vv;6vl;N(7=aAoz`@Y&VfR{gR`)S*Ll`N)xqPB=Yj&lN({2Yt$$TMP;>+YT zOMnv|<{xA|vR@k4^}`z!$LX7Jx8DPX?oPpYjBMxCx`!SKjflIdI-^21< zE;VM*CuJC;_WygrRID^AVf|xg8YVdM^<4GkY_NRjg>&RQQrPnm?iLlVYB`?C>EaGS zI-M(c6BM?2veF#T<|3uQOQ&8&|M9aWJgHHwnHBOkXm>0|1<`FsLxUIlAE$3s!s6-qo68>|FwN?Et6*~_Vmq2t)#ZwO9++l~T&LCv-7sN9-!pP3setw;vq7jw z^Oc*)Asv8ry5;y+8fhzMPA3NOBS4i$X$#DW<1nqZ{!s3_Ci3T7eCeqCH$(70ya*}O zz-bfjjA2=?r@R_%daWSf9A{suLv3W`ERtIHeqY=Y#KHXiFYZN*s`IYYbi>pL5z zmpE`0{kGR^HNihIVAXCql5^(FTYo(w3)VY)o0e|7J*GKJ%)PpJcv8fY-rlz9zx6sb zAr^JMCGu*0%L#cQCF?q8FI-R>_kAlpMgJsXRz&`sbZA8w9Y6W`yL)$K1vp7fjWP|qnuDi{j{45Gc~l2GI3 zJz{(`Uf2JyQvc~t5c<(^^G?OlnD8)H;2O38!;T{Po2kR)|Hy|X*_Fh6GNv~{?ys}L z9u9;Hbtct?J{*LE{W^U<$bKfIM;IF9*74;)P_d~#o;C1V-tgHD%KoJ1J) zx)5j;Ff$Z2BK3v!iR5=Z*$o=rfc*|*)D2(8H~aO{SGjWGri16i68>O^)i$G4n~oqK z=o&=eA551NRo=12vg3Fm$Im#Mb#6I58F$idq2iyEX?cRUeQZ>Sm@@In&~X;f5cIUyOr4&j0g~3A{=^S-!SOax2qcC3-jA!1yLwpl62!65&g6ufCkwHYAU*AdtWWdcJ z13fvtq=#Td_17S5hN8MQi+NWpXp0tAS8@3!GcP_5lag|W$D@b(wPkF;pX6q36~U!> zuG@pv3t@~}_nF=sg4SRzk)a~eZmdY>aXtTN7!pAJV{CT7fJY)acHug`-T|LJ2mC-3 zh9H0s9Hp1_!;ARM7JSLgoL1N`h5L`(;pydD0U81EPi}3H*E?RuhP>3ngS+l5@GW4A zXukY}-eo>D?LOi!)y*WOX10MIv(){Mmyr|d9>xoj{)9xujg#lH#ny$ND{*-hdarMC z&whdH)h9!&Gbm{^f%u=S5s`Y2{}_<$eZ1+3W9VdK-e*8Z#sRezE>LM67n4#Qfx{^1QP{0^$?m$+nY=y9f{a z;8RpL$e%~(HDZO|lgZk*{+=^$=N&DHxwY*r=kZ)uZg&nhcSZ^go$3TRW#UYJz_=?9 zID=p29PgEr@9k9`2>L)bGgdP%kx>^!H*Y5bw0_6luP8{cIjJROy|Lh+UHDpc2Tq#F zbZ8x?)j|dsrXM~{9;u5tOD|LTV|Tl{JhLZv`2J%Wy^RgY=1&hzpU0a5d139${88F) zv-Hn5Wc4N9wpItesb}v(ajuj3uNUCCeaV#E&P>58@iN5jFpPK-pQH@fOeMH4uGb&) zq}eJC`%Q3X%Y79yPYTNA3Cq_g^$xu-RcfE#grznI?Ut`HyDoh;3z&e|^)dN9P>V_y zmN)!%zeP#DA?hkRhO;?Z#;)|Zts7x}2q!Yu(7vS`-S1;6)Lw=>VX330wWnnpp*cPh zl~F9~T^7pLu_IrbcFh;Og}puSgsrWSt3bP^dOmP{%j0~LLQ`b1=onHvRrbI(>*t!$ zDDW-!4*ZB?6~)}ahHU0YsA~bNQGekn!c6Vvyzj4gJ!=pqt&VBc_Lv~{)xp^>-(C&G z&A^AwPQtj!ZKeakP@>N%LAK{#Dosx1j=p)IP%(A~z8L0c!?|xi@jDZ5k(hWqT{v_Bf$1NJ;I$xS0x00 znWoM2cBiPhq`)xtL4)4fJ;C%Zj;KOEdX0I&drb-NThxSXZuOY4;>r#uUhVtbjC{;g zsU&56?%yn>kJ#F+)sOc-C~$usV#iPG?SO1V0@& zt2A2h-4IEryQ6*wCHMH{N-?+#9() z-N|*d1B0(N!}%t)^e2Y#tcy9JlKw{gOh(nLB{-qJhP8Xq!Ho}4G4x1BIHMt(;aya- z^3h$MCOtJ%LgEI}w=awqo>Y1HQ1#Dwtk-AXagN$z=^3~6dDyYu`9WkG1_jY<*==U0b(o0widV0Kwhe-ARBzaCdiimk``tf`;G(cXxMp0vmU? 
zjqSH`&iUlt`-&=d6$O7-bIsACyGIZ5>BN>k`^9MducSeU#n50{~kV^V~fg%Fn*hGY$M>;Tb%?pR!n5w#aP=q@wPIMwjxlafUR`S_?r3)SBAw6b{ z`e~{z^E}sp_XVW~%JZ~{iX29i0424xHdSCs{{fbj#M^hv9I;i1> zTAVRyfT`wXXvMFMh)yGW;Q|ZwEZeo*irGwPn0*BB^w+4#_6(0`U-Rw3EB#R8lX+Gv zlga#R=HoB<=4%?<lu)R2BgkTzQ1>@i-aIlQ z+BLYc96~MRl!Fa*z2E;EYW@K`mk6&5!LrgniN_k*d2)_H<8(S3OSl zsl1MkB`NqkI~I4b^Da zT4YSNAV~Cyp9x(hQ*`X9)n3!Sw7e)gE1C#ZMiie(uP`VAN83u zq1NkRlVV&46XS%OVtFMiN@$rwzI1(9p2$S%KatDt-}K=|ao~P2w5!6CDXmJu<_OnU z!^jJ`MmEeLGqi(SG$i3PtO%=8i!?lYp>?hN3gNnPU2iX=ukUo+d%XOzZ&aB0`hWXr5KU1`ySnfjqob~-1I zQYkn>Wwj?%d(?K$XJXKluHm%bxPzMIbcx&%2?3KnG-AOv>}a&SN}$#_4bj(fE;XjX zV6O}Rgs492LNXEwt8^wi<~z!^@0UFnYymhNoC+2`7Rf5N!}-m?1&DwreTl*_SC4Vc z>(oxuyKYxLAQGATpKRynFd^Jg}W#>&S5T{MZn;?=cLeY`yRjs|h=` z%gj|!7Spx2Pxgc1?kaIci+l2d%NpPHw+5LX(|fP3NzaHUo1Dy!m+QWaZ7Af58vwI_ z9s_9AiKv5b;DG5}EPAN#!f@WmFz->wyV;kOGc3cd-h1nuTJ_E5MT!lp1K%D+qF?Pm z55p#+P!q`uqM{B`5}oolp^LzW>**8(=KrwWY0l2AeU#gEkIe{GW)c%vGVXtmQFdK- z>9%z_L7!hpB*pXDS?69l;e2YecoL1M;Y^@2l%{LZWj_!bLZHe1t#7h@|H6nakwHWB zBC`v2Bfa+OmALeG%J(2m&XNG($Ey9=W*RkLqob7XH^>7MI$yKH!{?j};q9>nu-_nJ zDqk^~yJmbQ-yGJmO=f1LEvffw+{B}5BG~oqA9D_|&=ZG2PmJ8YxZ1CB6yl*Kd;5gM z^b4&>qweSZn`aOkmOdVK)W>QgF1jSFS*+8iGm?ycB>%O2ks|GvFM@(fR{FFCoc>c8 zi37_OYks}ogsEhc6Z+2*_L#tpBlD1KssLJI0(STo13u&Z^y)P8tlNxncbee4{2-LGV{ zNlcHwA0y~`99T2Hwk`7>#C7O)cmz~Z`~soX%SS2|^zNClNktMJya@Rn059SM>Vi1U zZv@QEqj5Nm1-tc+=c}Ja^(gQ$N8uc*d|k2UTiEo>EWBGs-QO(4dZgBQw=FJ=4C4dL z2pYz%I=x9-&x=uEQ>JxWz6`JlZZtX|CJ+hh&@{|YTx@P$?l7DE6z8pO@~-7kO?L4# z56$N&>$YbbiQF$MXMGoJY;09m6F6FR&Rz`M9Zb0QidS9A zk@hUZ4Ex{vy&}}pyz;m9sojO%1D+kl5N!2*+&JApCK?3lr2n!_w>vMj@svtY5k%Ph z%8w8gF3oyvVf=Dyu}DDTI#kZ{W#9Kj@WG2p(Twh}nevyS{!0gA6*4i0~P zq(U~7eWI@_{}QSmaj4R+;F&GKbomzQ?rdG6FB}tW##X|>!-FFPd2;$~A6QBb;|LG+ zDR_heQN$D#DjV1Tt?WDd(_J3HF*~8>w-LWTWP3{CW}_0rz`Yjz`qxK*xW4ngxps?N z)Z_l_0z6yD1iQJP+{^r{1GPZUYQ?j4c^mAxgtu)QO_NOmkc?jH6bmT?1xGeN%eg@G zPsr<(0UxsnqSs-`z29@)SpwiylbX)#@iaj9fKi8)ZB zmtS4{Tn1Sdx-*Eai1M`Y%;Wy8@>^sXvSL?a=#t2UyHH&042kZDHu#am zMTN}zO-DPFkGH|10MlN@`$XcMXyptH2KZ;?vVg?+Q$*K%`=+KG=i{51Oe*`)-=?Od z&Qr9FL8E(j4oQ^<<@zk=Br=#@I!EN0dpGF~`j+<61WcR;0gbMuaz~UmmDKJChjS(6 zrkbApblfl#{i~RM`rMVZL@O`wE_W>jQX0?v^GE&Z;>|A)&^tx96V;m z5lcx1bBX=u;29x`@4kmR3lLr`VCL@UxzzfCog(~A3radt`W*S}UV-hJ2_)L??(pnk zpM?#K&|kiYiC3a-NxY1hd1Oj1z-Px`6KQB@+#eYu;D0j(ogZBB-XW90JzqM(mMhNx zR3J#j#zX8V-3cUK@>L6)2>Qik&}s%pl!8{ODhB>(lH1I?^?3Unf(_zVA)n-l!9(!l z0~N_0ylfz7u(GZO=J=qUn!d>e#v(SiKnZ5dZU~9MHfKQiev^3YcYTjDiVrP@$tGT} ziE|L}c71+wfBk0Vj%AhP6>JRRKmqf#f_D*>rLT@gt{Zl?h0cS`itUN{CPNL z)ns8tuPykaJ<{$FoNMGH6?G$yMHHkhM9whP=WA$T{XJ5s#@lS_H#ti3ifq$OLf;HA z9nQ?yaI;j&5})|IAM~CK4~F(kty+19QP!5Zur3sR+LyIQO$RR@#X6AhA&+WEpET+wA-qwpVSFvs0%0N~OcycUBQWIat3K#H=o8pK zkb;J2`ii)LA0^?=R@^ZD{G}*RQnEB`2`<5qlIX!T`4kd^YCczjiA?mt7oWx^fl5x? 
z+4}`4ELSjfX}!pOKMQ?9fLq>GwDlU{Ye8_N;VLVF{}R9X$8jPj=(5 ztfi6_uV_kEMQbx56<3WI;&g{xR>?n$u9wVMJY`x?&pE}1@vGr@cfrfe8c}FvDg(RL z;!f|$F2KvR7vE1Jkw%5Ig?aF{xnyGL_m6yNGM9IU7l1D`W)|1(`*)3u**-YarM|#J zb|*D*gZKtvLxWWM^aI8R$XOB|w&f8WAw^m%H_V!^dCm-G`S>mtNlA@Ggq)9&y4LFB zN^~5s$V;Fbt0LH_QrP_|UkjE#m6u|r&Um1C`HeVe>+w&XQMPwkEQdCE6jS(YB`dvf0*UKf( zTaBBpnmxIuj-H3_xdU=-S59!TXaD`kw^Mxi?HjjRG0vTnf~FqRSgg2> z&Ij);7s|`f!UJGYe%h?m!=M94WgGQ1Y0nNS^Pw!k(InQ_RlKN5NV?mTiCsazG=t?+W$S|jI$#TBm9?>dXf7q0$BO=4~P>-6#)dM;h z10k0WyJep@UxJ!7x@@oY%U&!Rwt6e~K4mHozjvJ1*j=Hv}m{U;SI>pbV&p|#!9nEjTbjaGI<#eZ*C|~AX zKFMXbb#lkx-0Kvae>ggWD;O-0^SwXn<+;W_={0Io$V^j5JBaY)049zqR#5Oo?xti{ zuXo>dd^BuCu~eyuXAqQ@w2gP8?vMj;YA=0IOyDHWp=}U=Q%!s=`_*voeCv%iOgKb) zgW}GQ1AbQN&ru3i08GWbG~v+P^Q^<5?{g~T;gpE8ofb#-Z~~|!Fce~CU||;`>f@OV z6x<<<>%=`1mJA@@x0$0PY=L7(;^a}>HpKUSA6<`#PRP&P7m{(e{y8;(FFmk%yiweT zUBhD0MMqj2=6yU2scjO13dsMPfZ6Pi!fP7UWDP+b#QW2y z*_EPeNW!&ZLG9UPO9+H`ZnpjIX`8D1-dxh@^-h)x?OzuhaQ0(S&>DRqyrZVnz9!%G zGrt-LA+f2GPgi6O43UI;R^J-ic`rpDN(Shh^PesraoO1)JlIqk5hdIaZVt#>k`BD# z)9Q3Mmj9zyUf=J3_nWf&Mn_evLJOgY-G5(zBUu~G|mI#;9avA~Xq>cXrfe@-4f?+n#zwyNV2*uumw5I3^@ zDI7dwoO^9LcTsaxVPjZNc}FDJUsZ#3pJx603W(Hc9%*qhX=Ju58@4Y+QTnY_X-P)U zZ$>NAg;8e+odHM~SiZBz1Lh@yF)wvkPbUN2Mdk|mAe>~sZ0?Bte5=Z{K^osEv4q)M+oV|>W)q3q zE|TOLqvH0(;ic4*&m*0uKb;KeQsZ| zo%@_l(WMrlvz}@XNJ@z5sPPfM-f70O)p0wSS4mxdhoSa+diAYM+Ou%ne-))a&ujp- z(Fww-U9c3P`nW7B)^ICunTlmxj`6wxVPF=doc`+<-M4@MczU3%a7O{^-W}nODTY8{ zU6D@jWS-SC`-2t1~P3(Ox+oSbpV6E@M! z4dA2H!`&DU1&=8#2xx~m#&PyhKeU+U6Ljw=ZCxIY;o%E@|F(2-RArcJSiwp>+lo2Gm5vT>o5YDHWr`HcA2)nhblKhMk>PunAj zv98?Dda@uhhFtzNUqYyfSVX*^Zc#3(pC2AacBb+2g@-_hjSanc!498+(nARjg+YR^_)P*~t)@zOJG z6&-wnxff^naKcJ7Nx-&NdaklGt%VHlb04qDS%> zoCp?9TKs^u8|9e)1OF5BzahzA)ymr!X18~uyDRtczLx>FPBiZ>W5Nz*?0yQB+hG$X zjl-(Ip;f6+Hb;RB1ZzJFbeF_te{KaccLh69;tAr)qhm&AOJR6*-g1WTJEro3v z?gB~QjNnbZW=3M+dwK84mmg0*(qO5D6n*pdKEC)05_UM_&)1aoX<8O8$U0k2(oD>T z@NiroOzCiQNh-+AUxkYgJ;=O{s>Kuw&O1sohY5^%g1^-3l~7gvplZGP8V_Rc8%b)F z>w6MpuIGip#kpItg!YTTH@z3qA=$PJ+~qO6939DhEvGkL_xJFl$KTUdNPe?00+$0z z5+{!h3|vMX%x^TrdzT1M>*<<8*y zY2z?bBRHXR4O$y-4L|;REIhkEz^SO(z8eeGY}TZ@h#Kqs94h^Md>OUkEh>8mf7g|g z|D_mAb+a{h#C49888I2u%@SPt?sN&>2`W>eKv`1Kaz=b+$_^sn{e*3L`+Ad?c3!d^ zsZj7G;#$rAyC7aoN(JUgB|Ff;>Xgc_rmj(`lqs!3&UXI5pQr>HBH7@T23FMqq&sB) znpCJ9%M6PKJ=<*dm%2W)jncmNE+0SiNUu$Prg~BIYGhsAp4x1`zq{l>s=GmBS!{6R z^fmvj4$Yw0abxr-13>hrXgMiheTDX)+QbL&q~a2S@=?$xkIGx2achAB^s7{v4PGJh z>V)I7XiJ2XgdG=>evaK`m+b~s6+G>US6^T5pRvFiXu?GGel8%Y(Hy~-87HdP_dkk` z4hK#@4ebVNEfDZ|!OXX#e;QmX^Omo0g&ra-fKdF8>tYZQE-x>uZx&lCe>Ljs3BDG7n|mUj#r$t1hlDN1hAtv=op2R3VEOWF!p^dr z1Or_Ol@O|8^Rv(=Vm3BJw!MKK0b|C7Zv_5n*asoYbH6SZ(uVBkuB+8Ek*Fvt;Xj&F z_RtV@3_TpMsmeYK5ll)4CnOxtuFVcad!Z0^^)ulEAb2`d>4p}wX;`;qH;Q|x*y7o) z%6CN89p9HOF<^rm>L0S{(TybRds50e24Atz~1(Eq$F4_HHFeZ>Q$N zF+Y6tR$|O%((f^ht2|@wH#ssDu`|n(xMo|y*J6Pw4Bczb87^~-=)lT@}l=%3+DORS!O;=x)a(;;v!c;}583*O#MB-Y{= zAe(jB@l4C9&o)?aseeIt*uEB8T_@6X?(792km?I+I65Z$51L6>U(@>4xGwjY&%y5; ztX3PScik0Fr=NgzI-6Ttaj~&q5wRHUxxan;wgE8u>p3J4c>g=`uY>ky?+DgD!)@#S zi~nV3p5j28>nLHQ%^YGH#EjXm0|+z{>MjL-dHzY?AKt>?*h>-tl-cZ%m1w&Fxr~_m z3~Bf;B5T7gPIe;q{>cKLWj~dq{LHd`pA45fJeQPMzAXoYD6xl!e=E0AJSFkIj>?Wy zvt#Bkg?|Coy)4+S^$q&BZSYzIA_)gwrU`BlaC^Qd2!=bfD?QQb!>CI;hX{G@@;^km z^=Ce!G=0;!oQA8l=*MuX))O*au;(Mf=cn|>CcxSa06nWDfj;!`c?)*fvRJ?B=C^U7 z?g5i}RA`&X3MR|v44t5T7AkqhZjz~K%Rs`y4r>ZKqTsYV%U-+xrn!P2$oLuK;`qhL zN*bH<6$9QvkD6YxNE{QwuO;*je;RH04t=xdl#oYD`Cz5vm6@&4}_50GrR&e;!RT~Ju+`Y(hIL%m2U=| z=@w1NPYQAHR>Nu@eKdrAJmXA_B`Xvg=nXfZd=w|p^H~7q1xI`#l^2^_ z%qH9j&W-tdS7|8w2 zsZJ@BP3E_TcE*uOaMO%0FXM7p&JIQ}#*PIFmkR?&u*nc-5KeSsi*&NH`1SZ2{{Sq( zZwv0y2Xza<6VVkKjBB^B$<1~e10HQYwm+U|IG6bK|V 
z`L04SrbM}3HoX1ZK{}g|^ng6D&{M`v*#7wdU^NS6{ea}g;H_Ee-r?vRc1S+#&*u#J zYIl~h5CqGJ5i+y!+>XHWdSJWtGN;wxKvTJ>xzTnWNn6Vy=3R4t(iVDebShxr#Wsj6 zPQ$^$RKbJ9y^T+|fSjJK$qD-XK{T-9hFO6~g%;W#kmF-+N)nW6evq9Ip)C67pVlq& z=zW#>8~lKu?zVC0{=>#5NgXAFC-fB#bAqla_-7<8#u0=~N+ zVoT+FFt{oYXzhp@Q+HpA6zPI`BRCjy@(2;Ri*xlJrMzkH3om-*tyZI1?||Q68}a$) z%&uE!Y2SGqD(^=D8->xLB2}-Y2OcT7RNvbz5Qj(YBIF<B{=5<& z9s`vcynK6+OEtmV#eglB-634_ux=oK#%R7&#CVpivnn<4Kj6}71OS%~wC`Zx9hh2t zK2DfCXN?qQ1d}}%$S)(>xD@1t@CBFaJIf?L$yF=E4txpATAko@Gxfdfi(_AQmnI0ZI?qJr<3FUyh$65jtba(uVd~ zFBiuM&+3L+n9bwyI&yTb`a@&Rw+;YRt})%M>8%#HTN23c4V9sO?SIWA&^ScZ&vg$R?1Zx|4-L?E_8-+omt zfkSDnyOiib+`ilhShUnbBTEATmfnKW0OChD4sxjus6tZ$uAWxNA5AvKLOrEbqBfu} zwrA0RTDd4zk-ydV>Vxf!k^6Dw{7p!*I_}}vmX`VQo>qD*UZl1L_}8U&klpNtV3F$g zz6-wSA$vYMe16&@jPkH|{hg--F3|(ZFL_5l83dtov1n)C$@s+;;|;4^&7QRRocZBT ztoadb)iZbJ71n9vM9-~jXx~iHNiSjrc)IpKS=dsmbbl&rD;zI0tjyKe8#VdCUi+hFOuQ5_PYZ0=LU$JQPYZyQJ6|(N2Ta&Fko7^6%I<7%6ORN8msS`e+9WT|%6s zGvxgOGXOi=xl#x){x*ROv^Rv5&cj)VmsO2&>_nh43_CL>-F{R$x4b;vovNvp8I)Aa zjmX1{SL9Ypy?bl0C@jBCXgN=KJMB;hZRZiHwO;FkhRGw{GKVYJ;#Uusy3D6SV9ynONthYR%>&&L_+ecuN^8ug4Zazv zWen>D0v`SFZojzBHtwFM+sHmhUkQJorntX{ssa`&<0GHWy>a@c-FK9BZm`XI&}<$t zaeoupOMyxZP78U=b$o;4AqOq!cPbFrm28_aI+)h+4D$v@c0L;o&nv1L9X`f-;(0Zv zP8EW=tprjhR)O5C2_fkf3rSfqtej=j62TdBzm{-7No2^s0vEf=1!ix%8+Z!tSwmvWY2cdsdOb&o?kPs1p!$yS~zZi!DlLdMy*BHu>HtG;x)o;QI&hSr~(^JO?{g=KU(U}8XdMRh=SV1hu?A~TyNS>`s&Uxx1uE2&im z97Y#Ld14Ysw^e#i%A{s1a7#gp%jT;I=;i)4y=1e^4RQy!r32k}@!<9d?^MwIQ?#kN zT97A(s~pK}VD=UmQDZ9DD9mQXUpb46+{JkxF$uo-S!VtG^C?4N`cp}-@+%9uEW@8( zm7lGR9GdKdv0@IO-lPs^UcPvwgOg)MFm|@15-4D=<-=ph3YD1Mv!>;R-hkpz~i)Dp^GkN5JbV}3DeEB7IR!-#ZgE|xKqAepG$XR86s2fjemD~rk|6C zH1K3(H5M(UQ*BC?-Ql)Jbf2Z9qM8^paa&y+MI~FHz5To=ch$MLj$I|8Zs&7{q86;x zAx56Bk&27~SK?i!LHAWI31xGEO^g>FC;H6aIuXQ;HlqlcFp}JUt&px!Ty&4uXjSM?kWuI&-Gcl(M!>pk72({rTNds zDsugsO|7>fZhsPK%1ZQOFCX)1X7qNJ1#t1k$ISp5L7ul_`b%fsECrXUg{CgdH@~u% z*$3R}N^(=BZBAZ#*ztLMkdNw5yQ85)#Gp`Gm=;Z{kJV+we=4z?>P^=RU>lr~-Kxc& zYM$pZEq+zcoas2>9pihJv?NRNp)s2G_9aogNmoi}Ix&?6a zvk$AW@t#2_ozYU6r=|hpCz^wUVcm_=L)OpPEi6XktWRFxv}tdhl?G97<5$Be3_Xx> zBBHb3BrIVBF}S6o&4*p%)vA|g6Fwop;fq+5TWKDq8e@DR%oFf$f>H7(&Q~uyK26ax z8*r<}=5^kT+-D+-{(gJr|1E>Nz}yR&podyQD6Ubp-dWOJns5YOmqKRoVph~h$|){F zjhNFmYQN%}G1Ya#1cZ}KNjwcb@k-qZ*Y0hLL%~l^qQ{)ciyBBbmtmG7QcZh_@5q4r z^)^m_3yi+brR@uqxH_VJ+Tfku6sg$#Q-xkfKcGp2e{*{u;tTaFm&fnz>Az+=s*-xC z!GiLbQ0WzJ-h=r`*Xh0NaSo^bE%y^CzEV&++nfxI@AaCHwX}aCCF*GEM2_w)HDx>q zhK*}~5Y};J8{Em{D9AYk*Kn*m=bl&*~5kCTfZw|8lEmOpPB>i*>AJgw&Pl_;0dlYnR10VXm(U*KC=lECfh2j9v{ z(;mxO^QCWz0w$O+Jc(aj&qf0%-JT^hlL;(Lo^iZ>m|GL17~Oa0ucmX6~v2Lnz>ja5kx~z(QXL$MS{^ z<*2HXfGsh>jlHhHj;-qYF3SEKMTvBhC0;SXa~%}Hgs`_ zHJe@+P82#qt%axRGmGZ7vN&cI*M)p`#}%@EH6VpvUZL9&yeD1IO}Wh=-WncpF0tpz z;Dn3Y+5xDmWeN3?*i(^cS>l7Cs-^2F3~BbG9mSfG2b}XMaU=N9K)^EgQKw;@MF20q zc$T1-e@9HVYIB9#-fb!-v6{S6Lm*ALNARfl_D@VHtQB^vz<7%AUw!rfmE~~0Uu?aV zkkE77nsgiNJ*FX@A>H(aKPkFx7G0@eD=Hy zo}of0*Wp}w&TgyVW5`;sq=l<4ug?nxSeUoMS?&m7`L1cNjnm(M-I-lnGKKU$C@+V7+TfmOO}l0oI@f*3?#H^0_l_cuaz5d1ihG7&|`I|3*(F0({PI6+x? zh8cB-`2u+M1s;iPdrzs4dlol66p1}^gm8;y)))s z)(gIQiq`lL_8f9|j&sf#;4m&=71!k zSinMTf?LN-u|pAV7jAd5lzt0)g0f6hD^yDo8}MY$enU-fsu4*Loc_beFD`Bo`(x&92bU`#jC2ZF!gC3kx;{kr}3 zgZX?VvJk0k3iD`7soUbs9a^JIL_a8D&w0EnRQtCpI&-6&nh2WAh%bDk?yEC-c%1PS zw0`M6*^iJ2wQ?uYxtuKo6Bbh#vaPaIf=FJDk}VLRrq*02$AUhroTKyW@o)17$7Dai61LcI408SM=qN17&7Dcr(b;OvEi@ufe1Z85>f_!Xg$ua zo?II!P{UqeCq2hi3HU!E(<~~DmO}}+GA(V#E`Lj|t4fPfv5JuGW7n*3k zQ4F|@2r9D_kRfH9?^TF90o&sVchpgLG%>Fy?VH<|oJ~zjXv5x$Hna`nunIaA_p|37 z`v|=Tu^^;9IHRqfmbX5~fN`MVQO>4{pA=OunIWG|cZp)H7WLH6PSvtM!(KE zihsVi9#zP@r#)g5sm&! 
z)tT!ys(<#wMBox|17l5mRc6<1w4u#*#@EE>&Z^P(aCd+&U5y2h-p-)qP6$^q%}Vk| z65NL(x*__+ms48}1p?Z%5=9L0Z8IJxRICOMH=>I-UO9+9N6Z!O9n4+CmC*q{8XLJg zwK;65fJ?BjS@~^L$OdFJy!tGLU($VtQa;Hfvdeu%C;a1rvTNVWAodscno`4CiY807fdC zSt>9o5anCV5;d6Gp?aj_{!r@aRp4H!#a$?sxfZz(>kxRp) zyEu9!+d8N5WJm7)a#um3!6EV0)A(R<7d_bUNFZRs0zjmfoHlqamz7$WU?w6xQ7I~% zbc4^e{`1`hyeN4tUHWI1@B%-}or^i%i>)}H_wpil$@viibKGz|)82k8NQ`^gaf91d zkWp;R6Eir2g6moN+>|=qXQ_jMsL3He0{EPxc_P8#Z}=DIh(=dne{q;c}`ZN&W| z$!#m_q?<3yNi)SmgWiMti>4spRF6NULcBN^{))=m_dB~m_t^>zGpg5b{CBHzlM8x3e zA*cAXwGkgHN`JT7NU}i*c#LjndEYnRI>%3AM6KUZ(db1gjp^nfy1{g^>;^tuv2$aAZmsHKkd)Wo0-4gmW-$V zx*X@4XPe@MXu{0@tT6w=xdQM&%Q$`JO0lrs`K7K5IO6FdQGHamu?b9SQM-0P&y|2A zy;1IwWqNgACMGiPBO2v0a;Z_zBBl~62QZW4h zOa@A`(6x&qjlN8Zr6`Kjgk>Wey?5E7G5zA`of#hud#IXofCeLcO8pVyP!{A4m)!vS zt`5eE(N~N<%%};Wk%%6Y(LwEb5QAKeEpT;`7Jc9z!9R z>#;@ax1n#*9SAS}{{?7^;c-2g7*23Dcom{iba|qR0)A)GSM(rFEqoR$*zO}Ar=9!% zhL9|XQNt;qF%{>B$?7KHaFMB@p61&{y85uZnG4}qG;+DhN^dWG$Y9c`<=Y& zWA4vnpAf1PHIa>)LM;5K>pDXw&*4{B+%6JRSSSOWq0nzV;W%*njh>F?FX~(ut0wukvNxAO1~Zw?Zc*4!+$!(BaDbFJA`bpX^-jQ#dJ`;N*FT%$I3i z=gSV-scRlHbWi;6$VlHx%iakm%N0@H=13nmUj1ceZmyX(gde-Px!DQ)B#p)Xkha^M zt0hvzy;iIPoS`(l^rLL8?0Iu+-+&Nkd{*gScy7n(q@&x7t0O7ah^n{79xVc=QgGOw zAD+OnyPAKwc|d9}^ViA0RZ0gM25M^;)kJ-}7I|PWQQY?E1(jQ!3RmFiX(xuO<-kM? zKXu#tgv()VbzPaOX1`UMZm&BYA^z`*BZlma4riS6^^{8h2lP+Tt0-{Zg{&cY)wT?&X#N4yD!?V2 z{3i(qi$Oa-6$@Q&H@oicwC@=%4pGp+ek0>=4w z8n3_Du*Xjpned_Aqb?w?Q)6NJpAHL%K7vI_8_l@HZ*59$9&r!Ff+=ng88b8Ua(`wxpv>blA0~#VbSJzI7_E7ra0lq* zRdu(E4H=Y>cJ*ZW`8Hpet>z16)!uT69oda1%*^z?<>Ds-YXra_ib%s$`c?Q^)lqw1 zRx%z$t$F$YS6ErGh@HB+T1s9{C&FUV@qp^^PpA>Y3%P75zy0l(Zk|uLT9IyjvoXX`O+A{UQlUs|MwK{zj&Wp*R-`&DCG3?`43V-TQ#ZNm1 zTx^_pQ^Jd51Q03bL6x%^O{@Xk5lqJH@Jps`^`|vpo;SwwPB}1fj}LWnh!6NL+0=pa zH&Zww4=A;f?k$=y9ut^SQwO};vt`ch}l z#8}t5a(bm=bDbORp@(6WK6VW^Dt&KUd88H3qU^ZWcRc9^R!Ylcx&_>a3+F?VvQNeT z<h}G=i8rg1tuivQLMYi(Mv(|5S!GtTN48GAl}#ck zE2|-7WMr0IGFpi2y;t`5-_PN2-s<=MFIU%ft{mU*_j#Use(ukGf4^_mb98-VdjcyC z4G&y%_x$odTC?|Mm|kfwxSsLu0B?2rr$f9d>s(Y==9eY4=lLCMnp4ZO3Q<&RrD^_9 z*DO;byb)=iYm}H?=2?wgchi~b(z3laRg-q~FQ&P>X(Lh))WMjFj)_tVse>eSGLBBG z=e4!tc4@t=pOtyI(OPv_L$QWvOBmaDSD?&p-Xd(BzPkF$$0KdSwiby;QO4AA`$jX( zTHfA5dCSULz@UdyUd%I(Ir?o^Z8KsPQv?f zu++7;{3ZEQ+vYESO|Snr*7>bv)O5-9b6(V2B7T*T2R^tCeuyvb?U0PR5KE`uY!NVZ zy`U4v1yKEa{uzsz-&u|G!#8i-DjrHLN|~FRQ;@3>?II5i3riXmSQ|ujnPE#d&mG=Y zgXckFVmp-S%Khsc=ptR+7D*RnnlskA-#sHcH~RhdEg#ib?wHwnc6Vg(kdyiPFg26# zggtIaFLKU1@7ovfV2A6xNu=wKv6h&gL%TL~VM!A6BAC)${QG>J^;|H=Ak`jo?Q6I? zciRWrG;UKV9|DHBDZxZfEy7JWw%% z63m<4N9NNVcbP?4ZMyuA4Ei5wy|EK(b|d~q3sYX}u_Ct{b}`kBtgaq9Y;|$fy#jx$Fj{ zyU4@*CmKF!-Hj}7n&a7sXK%jwSiLjR7dNL&vn6kGUoOlC1_PP_Hdo*Ps3sY|%!`YbD}^@^DCguK&s>t*jQ=JNd2 zs5b_-B)kLZ!3TL&GJJ9 z4y)gFALHAdsLVfeeQ(2yb4))Mj-cAA=+9v2;PBek`L^{Ebbn%a``^B({opXdyb&vy z>6K;&d$zn*Wbq?CcCb#<(`n@9*M-dPn2B?1V7txkD+zd1xDmUO?Qi;f zh-wTQ@07jW8-4x&Ifc^~0j3HFU6(K`CDP%9Qq~Mo=Uqn#8JDIt*lJ1EfcFN~n$vN& z*MkSqlI=w!$wakxqJBH$ht|#e-QQ%jKwSr0^>)zI$l)gxkC4^Y4$r#Q6q}Ut=0!>K zv8^a3&5_uW&2{GkcEHOJy&sWMBRE9+F9Jq|_Q=)YuZ*cy*1AP&40ef^ zQWowV>yub>s(Xft)EZ)9O!+@ZHx1waDBbk2f*sd4Pgr+1;L}~JR%gY>qdA{#jIb<^K z8f2Eb;+EeCKQ9)O4#&1-Gd?i0y@WG*FOL`V`E;z$SG~0ml6LE7>ERmfE!TYBC30Q! z>7-^~XC+8%fw^dN98x)=99m_IAD6DKwR0Ipl4p}BpZn+b^YmABPG)r>+o*+Hv+(J? 
zFgi(9fGPQXr=f6dIChRIMCH|4=WaI#(u+u2E6RNIICvsmD`~vpY=G1Jrzc_t( zNK@Ca*I~VU{h-EZqXiE$!`41oX1&qU1!d^u&mM8Ywd2=hj_Iz2vFOy!`xP(MI`-e2 z1*h*Ye|Bw*n~5NAWbg1Adu@V3e`UpI#nxAHxdz^%TyV?k%01i%Fs7&9f zZq%~BI@%;N_IP1ybfP&MShx)`+;_o+yIDiby+v7ypQEz+9MFW+E8PzH*aQeUC0X;G z)&uzN{!X&0o`l?efjy<_0!5i$$i^g$ZYA9FqJu+vMI{T(uG7U{cbGjS6YDr13@yf) zdW4Ex@}i7;DKV6u7<)_%gL%O%aAbJ9`yluB2scmj?)r$W8y>-)3u7d+6cbT*@T=Qu zL4Ckl3)gU+$m^am-i^tq)i&=GZ`!X~xDSmqn-sa&E$U4cp8QK%W$dmG;FIff&wjob zVqE4}KCUxOA)~n*|LU2WyQky^FmJ*GE!A)`$nW+W6Y9R9=XY*8m(crwKASy0EcF}c z6-*3p{-RPZ_>|I)d<|`?!_Kyu#AsA*ODr;Ymdjg>o64pwzRb5alC4CcE$E`&N}0=7 zWZ;+xpmMnMq;-t96P3p!iv*CFE9-hPSI_WW@Onyk-#@=Q`S!Qc7gp>0yNd2S_ONf$ zJ-KytM{1%HBapiYiY^yY2dtyWuN^LyXK*}Poc)2Di+cX>?J|CxR*8GvfN}$lsrEpA zg3Y01TzYA+U29vB_=QESZGFeJ>S*kV82b;2zq0TQ<##C$Uni;&+N<#VMQ#0ON`*~{KVMc$*-P1gJ*`mgI_JSFmO)5vjQW9^rLV?JX z^m4>_z(Re2Msh8oF^{zd<}vBg=Oh2j-opNra1x|!OW78FU23ka$jV=VQKleukLzrj zjbnpVNQ-}B6v<8&*&BahBB(1H4RfU9Xv_L)YTZ9-%jCqIvjhDMj{$q;Ts{@!sJmsu z^4DSI(|#g~X92SQ6iaMa5+OF>)|{31$Wq$fMbrH2;Ag4er5)zFlN10egOs21*wmvR^~2$UJ{;lV6d6xCSKn< zIzpjz`jlm<+On6cdIs%}5B&IPom!rIk#F1D7jDRUlPop3*H%O{NEa3j{)j*AH+a8! zR%_~ecHnhW_k8wi>Bi^d?lSWTx-wF&?<}cC`|VZn!>!@MPh+aS7q}|MorxpOnD;YJ zT5j?+`erxgu_V?p@VfKKrJrn$qf6DL-3d!ICG)S$V_n9!HEe6BxcPL5Ok`{u{M>))z+d0@TakV~WF{!8FweOe*TW-<& zj}{}$D?0^dhI4syMFa|{X^N&I^@_#{pZy#q)n8qTZE%woXkW7SaC1CkKSnVYSWB8C zog+l;-i)! zR&YT>bZO{idXoE_S3fVl>9KoMaXrx9AS|hiPXN39YM%%gw8*tqA81cBIE>NdkYCm~ zVmY8IqvSsMSFM?chdPzkczaxfj7Ra^TK;W)6=qCsQ!C>u*6!Bs)3f`=R;3T^6_xif2^1{T5!Rm#&ihEoCmMhS)Osg4PNy!*;8gkG{ZFVSi zD25*vwv?@^M%jDyQfB8xF?c zl{K{o<(fI#o)a)-H19b&l(JOSVblC`bZ7=szi(~X#Z08enu0A>y)V6Z_j@d9%6=s^ zF)FK-kn6GCw)FJH39?^Q~wUvi6qy2fOTG=g2Hz$WPP0DdMDkM%5kj&(F)3SdKiZCrPGid> zXJ+I3uK#G*p4ah=-ujZQff?JHP7+f`%9c}?=p7#D$#I|Y2`tKad6rxMO_pB0NwNvf ztS}Ml3eSha?2W2TqRRcm=<&$|i$Rh_ zQT|8hZcd1B`iPyGSrQ*_Fi!~x&8AqKmhKDHvR$y8d9jbTb7oP?-HqAxOM_99#>c|B zl>Ol>V&Pm1zAJ;Sk%3bsizPqzEf(FVN#-mN7l$mZ2Ev{$=XxQk3W6)DU@Y-&yz> zx&c43XAHJU&o#Fh)(BR(xkL*)Ufb(3(HcbB1!|nsv%*C86um>y^8;Hc zolvAg-XqTUGtZmQx z`edE#{;x?|ndS}rN@4twMD=9%;AEP3)jG?p?F?Kwzxz3fxDv^q#=2`bFKbLwSOmRA zz>BDkGo!9fdwOg9`dUNvQU_@kj4Ie6`df`v+}bgx%?yZth(mBF2woz3EC` z7xm>n{8==lq1PMedwJYoS5K1hqi`n_n4sHgv(@FDjHLhi)aF}ho&cuCyiK|_{II#f z4hM3qm_hhrPIz^8l+j7PvR?E zPjam(onwdeTzg}oyQ>msSGm70^UC;L`G(vFa85ye1RVP0*?FsjUz5lZiS;rspMe>C z2cn=L&ZUufDvIL%H10Iv$QlhWPorFT#O?coK2;wnn*Vq{&v9f$WXx$@S8)%qwu{un zODnTQsuGLM4TUqW8*^-o*kvrx!K)o2U^q*D#9{EuO^5U&R;~HG%^&#l4vgkc)x=im zC#}l{T6DL6@#0t!GYkV%UOd$MSyhHH`NHe>d&s;g*CPnr=3y^uPw+ItR_RnUyh89+ zJd@#cf4zJ|8d{|+i%=r{=p*wKZyl?G!y`d{j1nSfZg7eG`gx#kut=klJKN8^(0s1T z8N`hDwTeTrV5`_+b{_`ls)UsWqb=a&uLMVDLioex~dZcUGEUY9b{R z)Azm|ZN5#XPfsB^F3bOv|H$ShtleDp9*LXK(8}9w%(}+zmmkU|lD*W<*gyE?`F)XU zc+FgS6=!tB0Jm2lQWSxpGiBD^z29G($6YG9L!?Zd&}^C0acAypsz>P0aeUE;F(y>- z-%SHgsxly{&GS`=h(mb43&$@fnr?7$ZoI>0z9+0=m*U=CDvw3{sZNVNX*avSUnN#I z*S0A-hMX8)LYL~fJd`u@-CSb^p3U^7PIgVHjO)eAbg-7;RM~QXsYy6!>MfeH{39i< zCw6PWNj?-ZQ9`1z_&Bc%PmwB*qa*lP>$zeVVf&uj`57zD-EF8)@d`-_ zn2KJNOtviZp)slo=CJ9$r)Zj8*pK%RT)Qy*dT*#DjWZS<;grjv(?ge4a3Eyk5!OEh zeI#gGnaS>AUY%1^A{92P@pBycZen-Ukn@!y0k2kyA2gjO{117N4L|Yl*Qa@vN&r*lPJelJhHa$d@e!+l_o2LF=AQ%Df({cLX|X=uV*go%dt(3n zrI+W^Sj{JQa)BbH>#-O?PgNtFth|ta|BJIppTt+&x5;=_FndZiSI92mDFAgwaTs5y zxbtLpe@m{NpklnlZ=yOwDmmYM|B8&NSnorl&QLA=s@77u(-#E@L0qX|g6T=IPP(Ew z`4*=xf7kk8J+$Q)Jqd*X58KiWVaMV2nufIIit6w5&iAD)Bhz(RtqGsBm`>P&bI+}_+}OvFxl3CC;f zx2^K5dOm~&8qe=gPUtN?sC;wtJ6PMq^*0*(SE`{Z75eEj*oNl0Dpe14@?HHmWvfVa zJ?47^fpZhW5+JiOcOKkI)!$FA!%lXG%bo)}WfUxVc)XG=FrJYqqok)vn^$jhmv6I- zn~yIo{M8t;%o2feQGzklSBmwe69|BYZ2s9T#t&x+v<%!GTA=kT 
z&0*+Ut#*sY;)qQdjaGL7#b0m->Zh{b;O-yd_kT#({~^&M{cMV^Sh*hqRiwBB&uL+I zT*?ndE{Kf)CYJkpQ*kC5+kMkED}(76cI?VmHV^3|Egu+@A7jn!9J+Eun2mG zxb)uG*E^(`5HPU60&{S)cK3Z6Zs-YPVg|Aq*bV-x-(H3vJwuANJy#&;Ff@9oy}a=H zMSB$nM6?Wg9~m{Co_?L>I2s)%v=MZ$s2LM@&Xq97n$<>ek8rLUOt$-DPZ!?_%C>(X zVsf9Z7oBaf({L0Yh`-+%ZWi@ckwt*06$T7Egl@{OoRhZG94a?tPm&p*71X^MN?k0q zG+siaoCvK?Y4OX`s+(R>mX?fN)qMX4CpTcL)+eewzWFXkxK>h_=j@A+GHsH`Uu1$s z&5eLg&;CpcP$jq-^__b{ZJI5zX$)9g*0XS8Ik|5%B@Bl*Pa6&;Q8wNFXol&>*mKDj zZa2Nm`qz5UvpDao%dzQlnkb{wuyuzq!K6&$nYg3!(SU|~o2x^LvA$Adfw9aSK5>cMLteG(%qVzxQHucV5vE zI736+soLh4qqm-^J$a{puqpJvo*Mu5LfR~;J{{m!qjHueKg5|gq)Y%|&15pi>vx^~ zA1jzS#v%Za)$*8Ug;-w%RaZq|oBVz1aul4p9yQUz*P-965^k4Zt(BUX2!T5~udkkc zd&*iljDNX>K7>z4r1#@vI{!UKYQCiHqQif1yuaMsilBW2M&*9q5HA4A5h}#adI{aJ zc$7fQfc*N(d|$+sN465_(2{g|MYYuvTZis zuRNt$vP2jGrW#&3X_&*OlO-3Y;BN@A%$2V?*_MnMX0-tjCz#Yc^ZIA=Krnt(D9e!i z`9;G9EVha6H5~Z&uSbcwO!W7Tz4XWL?pW64q~Sg1XP%O~|L^U)H8Ug4u?B7CJWC&V zOK)(0ir|C|hrT6WxfCpO3f{8YYIH~f&pre(6EtU8_@2DKzpOFM$mo0Kl{Bz2PB3MN zE!^b00^Yguh)(araRG3F)ZLx0`AlDo=Icp^+4COawD=e3JXo$19qOv+(I9?<6Y6Xrk3htm1> z`M&5t*!4tz!=Hm+&p-l48K5?ct~c&2+X~K+#K3K$%m!R*BIEL{9pYRM4ro)}W%kI) zxV5>{TL=U%7T-W)P}JEW-=!8PoMYYhr~6IHwFfKndxOE`Bo_V1atU5I%37-OUtJuA-;Y0Aul2_TDe@?o^+#WIXoK*`9XEGjLf6M<`1%#8}wcI zfcm6k>~8)ii-f7(38$fmLPQbEmWPye;~gb&o~tY2yPk0&Nr;if-uRz}6umu2_TXQ< z6cK+;`^~u@V;+6en_>@H)n3QOOZN`gy;>A;8gHjRbb03=*J-PJ5yFW2G^2`;1InR1 zG$1m*@T|Q#H>Tl^L3&t2LS73IWgG+qY4FOI%(%l>zdY*)+`!R4{xW`8ZjV6%X<6{| zTZm(z)J&t(KtdR4fy{P1x!cg9AZ>!aC|65KcvT1Y(boWgnASvuYrplu;sKLZra7wR zlHpYyeKM=g-!)jY_aD~gKddeC6;Co{S4hztP3pnHX8!d+>0{f}v0gmsS~o27`;hw- zg%?K{0p)?4Byg{oP>sy$($b9}4s?_!SOm=OD3^1F(~O2_oc1a1cqzR*XKx>FwvR?=VPeP)7-{7XrZ>M*d5&vgY>K>QL;sRDs>*N^LM z{ysFkrYGXxp)95(^KWU)yrOaF>K87o`;uY{PEfzkp@Bsj*7vD9wHPe8VP()h z?!O=`^fz`C-Sg`WS(pif8PWqpJH2j>X+0LP*v%PXeGFkCJX$Fc6ayawRf&R+<;1gC z&L)E`7qsatl{|Y-1g;{a!;zel4kKc<{m&a=THJ2BD8bLa6uX-nb=+5Fv z1fvlW;DsYcFWfK|831N_uRuUG-sxl4mA?zLgf7;3s+W5{%kqafr0VyN?4bO}j4{6Q z#n}5x{7(B>i>0{U`~0}A2EwsUu! 
z-cdS1MA!w9w#t=JbA!bT*IIG}gQG>zF;+@K=(5UiU)p`46vWD~@zo`pCX+Ce))U@d zn>GYPNj29a;K^{L)yYU{6=E^(1_RH6(#}tUcn^vdikKY2U=sP2xeA#?`y*>wsD?Fg zW&5q)b3ZMe9{8-WFa!voACuIZ#?HE3=mNu$5YDYSEe{#2`jq+7J&pW+eeA@D%BD${ zO(8)xHKQb<8R0W>VCil7Oy+;{q|%(@$yV;X5Sxa^3V{=xL!g$g>}e8q;i9i#VjPDX z?*%u&-gJG@Q2qGajRVSp7+p#=@-XE%ZiEfsuIO_r2J*c$NC87;QlXV5XC$lH=cU2+ zO|WF$@F-nq2l6U67r9clMQgqo2%zq#eS^`mPe@T>V^hTu7N4p8`s&y7MIP=U+!~2$ zi=XMylBuI$$?r10P~vT%*a2&3YN`0yU=SVrBthNIsV)Ie`LU-#O?)asR1lK*2eA`i z|BgHmFgWckTSL;~+!u*RT;&kooxpDVlwKE!NNMLiMJDd5x@nb|O2RCwHm1){RRaZ$%1gVe z?i+zEMK5kqOgfkB^HLSNUBxS8Tz8s1kMrU778zMig4MCr<)ZUB)*XH6Dx}@PPmiif z{rs}-Z8nYpebtHpFk^l$g75Bmz$@)~Dzkj0(5CVT1rgw+w-9Cn*dZH5Mwk%nWV$jW z7>@p7|FY3XG0m2Fd?|$sgUx-jqo6!zB39#6>D#GUH`CQy2Ti|5=+dI%Sk z2QIH;C3F^lUEo}|QNy?4@Cu64dGQW!o^KrHi65x!EBqS=k!p6K*AV=2jv;H z(mHr((6~Ir?Gbza^*0+{XyZfBxppibFKiR{*1>WE%T_Zm9vYJGbef{1TFE#SML-;;=)23|f9fleM2P*&| zLGZcW=+z~&y&iKNUvRf%;DDj&JMS35VbtecA49ov9Ju7$b+P+#U-|Pw)zS%sO{={E z7$p)3&lI@)L>jmbY#>RL-3^9s(fPtl-CQ9UBDNc}TnW-2L0SR*z2J9#)ShF+6(Roo z{Fq=GgksVZ&6NqL-|Pp7@6@A1&y8<=mRMgIi$6%ovM3({6U-tV&ar>q>X{CfPk@$R zm!5`4voY`i<9{baa0{pLx7)f#^Tq-al*2sugZ3T2Q|i#5rS3`yFs5kC=)(?N@eWV-@Xk-RmpFfMG*1xPJ{}62ovx0Ol0d-z<+eii zUP3O)b$eLs2^r-`*Yksb{v!;^ZLZ_)6`dT2WjbSh`M+0uWBk=h{rPTi^sa1>2U=x1 zHqQbM8VGbf9t$$SVEp4Rra)`kV*Rb-|1J5O&wf!AJ(TySu;I_JGsp~lm)apUU3dPC z{T0oma~3kLHliC6_Wyiv3r7bfDFYNjN+=Y7r#%;M@f?Ie5CB5Jho(Jdx~=h?!+*-% z94pAxZEMQ|28EK34qc&y6iB7{I|^w4=e=Eb|NoI%q*U*AJ`}Yv%!D_u0QRI!|M3bo znirA<W$6 zID80>nMH#LNpsaLL2P=uvkV7`;OPvD6%mt5{}a*Gr74ZcB0vecFx!|ab?CgoK(aIw zQJ2tk5?i(vO5swA=Bu~<-fsBzCefJ`-P~itXEpHj+t{ZVj>i!C=LEr*&$X>CxAjAE z0mTohaLOprKb_26pzPwzh!N~0HxsJ!NrJ(PKwo50M)#g9PS(p4*)4L7evint6V0~j z+S0b&mB&WU+TqIuQ*s%>~**^qfRh4zu`wtx$-ua-eI zMMD=nGA*NoV>fnZw0Xdq5&vyD+LB9T?!#&B`?MJqJ)ru;e>3VZehzh#_*!!oAHd(V zRkc5|jT3X=?TioP&M$02jGM~9;*UCL7-SZkYSs0h!<`Y%1@qWSu;x5n69i}1_7*T6 zh7inN(J6u)!l=}1m;9FuQ#^g+BEf(oktp}y6ZXjggu|$_)SF{hur%IRHcs^fTOHI@ z9?E!nyyq?aaQzM1&PBAH_4NBcKaYYs2}A^bdQy9x+A;SuAI=UnqTRB@%OTyUh5>f+eL=hlL`Zngg*{T14}^u6 zL$jf3RRP7;j01Z*P(lX~3IKF-5$E%rY;+yIo<$;vLW*6piOzUWt)w1f)mA778Dq%K zH}(UR5Ulc`aOwBk+>n*q>pGk^@t!v|{=BZ;$HRi4$_nTSf_mUq>281B#>aPiiBCm} z9~O5Q45184P!~4ylRNg?HG_%>AnEgk?Mx&>wh&&J$l_^Z46Y2bcNfIDY+Jdv<3Fo@ z`-cZg%K%$3Lq&&so<)9Bsy438qKZJ6hmJGZY=M$O06gD&%JT{0L|ID`U|1(lglxy# zKaK;|IE;x>zON?IXtKMCV{hc>Z&GA%;Gkit7t~#j!~!N`)9KBr@bS@Mn_ZRM7+7K{ zWaR3$Zw3+{cP4g zB}OuxCaX9zLKYc`*ut#{F)hb_l}DL$ zd`+RUy{NUBP26$#A?10nk?2LTXP{b8O?E*Jt?QL4tZT;mX~fX`+q`1 z(o)Nlk?*g4!-dU8E$n7c(2gOe1;KfHi|r7e;G}D@%v)Q49pom7a2b3q2x;xG%BgOw z0l=Lf2>EGJw;`Q2pR-Bl61qhoAjlMM?_<9iZ+n5o`NmJcJG@cu15Vix=o*6HI*^~U z6SqQ?RF*afQhJ)bK7?&r7>waARHyGOrBXdLkOY;eG>_#4L+_)UzufmPjMOftzR2sb z;2*g29`Nglqwoaqfwudm!^c^Y2=}W-?FEK~4}xrxI%%=C)h!ny8FHE9`p}7XQ>GB| z&iCUdZYyrCrr3-~I!|*jyJ&d?MXdHC-w&UL!V~aE$PeIc4R+&Y%Quou%38#TI1i{v zc`5hK^)D7{f%-4(WfOpn=5>1213+YR z^YIZpySBDBTp}14fnHws71PgvmzdMgO9%*{PD#vHeU7tGG?!~Gy^~tVN=e4 zDy|`w!G}LcTpx|Nw2SQS?-$(sAIvqH6Z;1tiZVn_o8xwvfrr>&Luz#LbMMD%->N(g zkIx{|kEB-DbNT0$ILLR@59sC`ES!oQGd}<2-M&9q-i?D}*+T>(up96lU8Vj|Uaf$S zkB^ucDH*P3>yzg}jqhw<0qmC^$poNL3H~pk;6+4a1DQtUtIFH|naIgzkw}ggZ=JXw zy~NHjJTuZ|177QbTWN=$-iaXp=Qo}DSC>jocqBHX7qz4JfC?S7-G?ji{id*J zZ-Z0_K#uh#p>D~&k7D;BuD>amB2=Dx;mR{R!PHH&pD?|nZ0eZ6?PPcO*IUpUuE z?yrCTSFOMuCU#eiZxstx?4|k;CZI~<0P#-!bB(9hP8{`H|09eZvZ8aw8`Zn-#NZos zUWHr*RTUQ9`sBYK`#AxsS;HxDSU5ra5(tri+136|T=W}{d{GCdLKwdutMef156Au| zcgsPi;voKFh`K%g$GyOvapfTl09}e;V~s#BqREmif4rIaZk}p#A*T~32yO|W%&F`= zO^o=7$@cio#KgYp?-!_0zY%->_90#UV$b_fY+3Hr#iMU)!QPai@=s_k@uPLYw1T76 zKDYv#7zk#(6r$%Sg!^Fli-|Ar@OI;gbjJ@H9dz<(E;LcIT6%&i?h9Lf($0jQ3WXCT4J 
z4sJg;0@GhU+=hfR#d@!Aawsj|h0=WIsZTQvU<6R$npPIe_W!Ldh#QL`#05%!XBP?= zV(Em-%aRat-QnFby*ywGUIu9cV5TK~kDde95^+t4Qk(D=^UL$BZGo&b$#6@qFjU@X zpheH|kX5TBf~2HeroBMO`lxQ{f;$eHMIqogHP8So5nU4HY&ZKsV)l40bul9;(|+h5 zgy*+U5JHYaRdt*n3+(=S;~D*S^V}feeW;%(hC`% zAO{Uic{BKeUflk22)G?(;avziUt1Gqi3Slw0O{aH{s$_Q{_6#v$3-kcID8CG3||Ej z0KD!C%s`o*{&lP>FECl$4@qCT3Ye`d|31=Ox~aqUk+I-FY!$wwUUJh-<3h`S$NiOUS}z>lf&VXVi| z=7VZ=(p}YYygxRMD+U51&E37WC(v1QEP?dO0cS8nkWUcLG2#Rm2wtrPt`!djjno%L zp&|x}eX^Y4Csbb~4)Fuk3btv$hZ~*wts!DAhioV;_Mmlpan5}2<0)Om1-vJ~ox;qi zc7xRgt+&-yD3erVc0UBOS@O)bH(_ZQw#VQMPLs#6;|Xo02qtgz4e!G2STv<0&}}*a zlCI6N$KSd&;{o+KIYPuU2GKn+kOzq71YlJPP^?g9=Z{1=lWY~z7KZBucoi`qvKnM* z4jKTIe;EcLM3J40#$9gnY>KtWB0Lrb1h0IG4`GD&Hv2(&^_|v{ixfycD!IW#h#igy z#@R|(E6Za-6v{ZkTnvq4g_G}jyRw=sKO}0F{sCi)X(U26bsMvr^))~q62=g@%wzl( zGqvj}actNh4AxyMeZ>V&q*(%qMgdgM8BYJ8PIw^nQ3dDiFTcy-%=va8M*5NQ=dhTp zu9bj{pKD?>(e5xf$EDp?P566p6uTEwwKB}$(raTb-~K(PlV!;u{GM&a41u%~$sp7f z{K3zQX56H~=3Uf?3uAcc8_9-}!iP^}5pI)J6V<}C@%fDq?$b-ViT-95WS@{Af)l)h zW)~`;?FA~*?Qbb}h(rE?;eSPp5Iy=Ja-_Wsq=qnD`S{6y5>(Odb6tYn=aBsC4gTg7 z5y9M;U19nzS4iI-DV*?kf>K5tIM4a*?5^mKaB2!4V(JJ1WS}#Pc6U7L8p?B)=V55! zmCuPbgU_EndQw;ya#&NB;CidG+0rm{KMbURnM5%%;yByfj>VSBD_P(P7!J{w(Y!}B zyc`q)zAD+_o0MAkCg;sp3cRhjLRfh1`4?dAh$BP>hi~K5jzx`&pz72#!cLWR!t@-q z*>B5>5Hzs|Ra@s=e0wt_D8!4N0>=dq$n6f!0lYMWnw*KEgRDu6!8mXwGZ* zTwQFdkf+n5hL`-lP1$-sAq+)~RjT+P2nQR0E_%Qj;EvS!^u)OQc7UO}xUG7(u@d1lJV#lzb$TqcT#U;R$j}M)1BNZHpYi!8Y z#ig0-B7fs^-u5~WZIPLJW#ENbYC;6?f)Y#3RDCJO(Ux$^Vu&+ofR8K?UOfG)o_6~c zlL_PpIhH-^7WPI-?5QZqDgg}Ol7ff^h}iK^k7P?JnE9U5^FU{k>}VER&A1pmh_mf6 zR9-=R2bd;v;_(DjMNw_jId-4QrZk3g)&Z;fp_Hqm(nVC-1P4=8nSVnbmvi6_#W=JW zXbCX9KtR>)YtehT_8xn~wS8suh2e|$F8OEWA(5Y?Atog=lH@tgiGrkki0 z>V!8L+93?2K?Sh+8*&?(u>PStq&U0qXYzTVASs;&i{q1J(cG(oN7DI=c7{fNPuMAcaYT+FNE*v~zDbqGR&f}@9Q<~}an!9C07 zhQt2=P6{y{B88puegi^}h`a|rDCDfK6(a!PSs%GKBFfrSFw#L)SSiWoheJadi17(L zv+%1B!f}}pp!S2MZcdq#-ftkZ7PRZF{{7P2JI|*ZpG1U`WQBFGG5%jq`0I953Sp*d zBm`iRWJky|uwM%~05PVo1H5R*3U%SFJHp0c#r^ehaqu#QJP!e=>9@EHB_m$Mng8An zGNJ(BnV&Cob5`mGcjbLLQrzT{!ZBP;-7W2gp{Q9!A0a=GCF#X%jrTft-%l12?Qty1$io4nx8U)>c~V`(GrJ4}4T=#BKXDh8j2KzjMgaVu!BM7a1C1vxt{;T4K{@sBS{UW@9r)B|Fi(; z0fPnz1oFp^g$Ni(L)GQsz!M+TC_*SC2fBZU{GWsWd3;DCF{(sJKviXy?Gbuj4+KL* z&4LJh+nTP=9Rm+=ddH@nhpw(C=yiLDw5&~^i=Q;%RNU=#G3rr)ZV7IQEU3IOc`n&R z`@0{0B846~4A~CbMj&EutXNir6rnfe0gQWq>>0T5bEv|PK{#D-`=6rwIVB8gAAt6Q z@N@$}x!Ii6G9flHXu89Qmdmn=aKg5SAMhA*m;;o+Z=mS>renp&byEjydLOGNymc-P z!XfPtf3-iD!r^dPs5KY@@XIYR3zgtI6wHNY{*Bl$fATbJZ|nBlfAIdUaz6%KuOh^G zoy)Ursw9gL-TFeLWuhT&_$!7Ek2xN?*+F8pvNVr+BT}!G?7$vsG}(&$oq56h zS>4s|T$gX;^^ARsVY_3EDgwAW9zY1$3aqHP@*{=!_?~;+-#Da=55-!fYXqW^Lf|bT z1ZpX?q8WpJkM(FP0R{NXu1ZP_l*WZw8&;nH{ zZgL>c`axJM(&pU#8$gP>9JKTt&Z0I6?j}^%f%xtbo3;|RI_#qa=X<8A_flT=4@Vt@ zi@y&OU`G62eJcAW5fB;M6nyfmc(b{`_{yOPb$K1Vc&Pm}_VD<;7XvR=8gX z*PR`VyBl=vR7ez_nxk*yKpF<7F36&GLTSw(if)El@jxo_y-h}^aGR7yNYbTa)2LPM zz;xd#KE6YYy%mgU$c?+(_t0+ro4hhW4xc-UJF#}?8wJNN3V_Pz!Z#4FJcBCjT6@oU z0y1wbiyW_jh%7pi`MDEyyD3SSMLC;F&NLb=el5I(C*^aR?Kl`I=#a(tL%p@pfjG)N zp5Ku@N47NmRK_2*PjQQ(S`ccid|9J{P3qMo;hsJfVrH)4N38TSuYTo`1~@ZQtQ7>~ zp4%mS-r&VmqF{<8l!|6oOhaN*~krY+m{?nL$z3ENgSx=sEVW|(p=Jwg5M`awBzuwVG) zSYzslpqo^NTJqp^zz%3#ns>9Sg%CgqeO2!jPt&LlG${V zK!+r-Eq5bIFrsVtb&Fk z0EF+UeG5uX#OzEbw#6$FxN-%Zdh-dYUkHDDzYEt*cS+e1OTKr5NGoj2Om<&iG(Zyv z!WZP5fod5C#QEDTNxitsL$(mM$>s}AF(~rLhmME&+xv8W-HxzgB0Alj6w-4SeY{Tc zMzA-r22q#!Gf9afk5|P*3@=(%U~fCxjJT%mo6=^fpRlQepN)E$mpQSuKJ$Ix_C=HOjGS|$}j`d%m=i;oFWy8D4ZEu_xF{@ zFd(cnZv5lW@0B12^vZ>2P+g%lz$9Pa zO>V*iA22_N6aw}#tj>C_(g8Q@0Yye6(=P@>wmMvx^Lv)XW7c~w8>gZD^#0EE4P+5O z;X>KLuv5X1jQT^&lHk6;?%j`)5?$-JL>yER`gjI#(@(9NyY8T3Ky!@Ri!s~`92B1r 
zZ||;9=z3F>Y`ZTcGhz}xn(e-pZ|dtDYLaRNh)vKLnB++z6uPL%ioG)pu6^{Hw!HpB)7ihOJK|8s|8 zz}>q5K~g@M0r1T|H!ZC~w|TtQ?ie6nMNmCCk4<-wqeEVJ0{Xae&M$oV7xMPYN-%Ca zaeW~iUtx5nG}IwNC} ze&0m=|6%bZ0)?YloswC-AN&@@Du@?FYqlSUy>AHfe;qhHixD9zN^F!p6QMd?0Ydo0 z&-*OPJa14$G}J+zQDjpOhk*!#vCs%KPsVD8l!i+Pyqmm2(xqQ4&$__i7EhgA9L%8{ z3*Lk~QWH8rA>+|S%|FHZ#O$}NQEdU$w2T6!XaOQ>^Nkx>gOO%V0V^^}Yz^c75gN2EugUzA_2d1CBLZa zFBSu6#KzoT5Dy9F_V1kPs}0OJ0CPUQ==}xIPnrYa zK;)|vg);1HEKwfTXJS$TPn!LzFoW9+{8QZAC)snv`4DKNS$DjftGQlCRl($X?Of@B z$|Y=TO^bEOb{pt5l!M6A2-q1D(90++6C8$mL5QgzBk(xx+d4>s7Z3fip304J5<3C0 z^^sYfudN5()>2uJ81_2ea2?i5Ur`75|K4r%t8r}%=%;*blvEdDor8^}IS_W?g(f$4 zTmiy{fkn1E;ST;)g2J(=;a&hX!xM2u%3Y^~h_ zXRyNNf`rZL<=tbHDAdw~B6-k-&$wY7kW;qMqHN0$C~t_>1!Wk8Q&Ey)D?>#qyxI33 zF#fhI*>X};1Ow*q1vO%b--|>CpT!xXNfJQy^z+xgh}Ih94ziMNRXxeyi5SiZbpw*2 z7X+&NcZjy%$QKSexN%{O6b-6MzNMr>zya`u4DxnJK=*tnR1;c0n-1fVIB-W3qekR1 z-M5I(WhUrzt&(UfVS_p&dkrJeStqfn;Cs7eDQJHQsRk)Pn(B^ z(cO9C!Gj#2UudX5hXj>9gD(u~U38y9wr)Hq?t?5X2|IWCe+Zz zJW&X`jmhaEtqB`8*Q+#!%@3CpcV=p)QkC3=as^}Or?r;QFm?Xz!UTR@LY*l1W;UA2 z0%4!JeuTKgRYc8c{Qbm>5L;HdTs&tBcYcgOgga{MD7kGoU1B5Dn=^sClwFQ#Sm8<{cwShzMyYgFUJhOT^Antwp6_<*4L2iwvk*_uKR*V9EID`$PS>-z03U>k11rgg zZLd2rp0WX`UNW(z=Jqt8Bl_c-vTsIiY~A5Sr|-0H_Yunovghxypa53jpgxAC&w$#7 zeu_#^C-Xmld~@4zB5AO19JuU?uvd;WZ{4%#Eus01DB~RzCCmjuJ2RtAwyeQ_t{{b$ zR%OEmtY`p{fYWZ44ax}F15Em6qRcX$CY%06&NeUtk`854zac0e^K1&o#v{n{9K6=} zDhDc^*MCkl!923ZM=Bs@R~wfaHwn|Dg~K%ufsr34y6Uk^`0hg3{TAk~h)Qel1cLdIvtfCu%1n3Z^O4hq97h zswXu0&5xlrh3!XLc0!H8%Q)c{1mzK^ixjFv0$>Uj%lg_osS!$g=*lPd;CDIp&Bd6l zUA>R3_-lfuHz|)GuuK5(i4MKYP&Wi7q5{R^cHFja?1I=n1Oq@)H`-@A53fO+JSNm} zGTZV8g0zrFFNGS`azv#bCHf0BTQJB zH*x?>F_2aYAUTPT8>37|4(frYA}V^AK5MO`pmgln!1Vee3l_&CyS z5_XoL>ZNtnuyyngM}a$wMXQTNCu#;HMswH@p9=-dps^fDlhjx~q45Kdnd;pX^)OKf zamxU@M6M>`M;^p6X;2{L(n@Jz$~47moy+Ar2>q5f6ZSi{K_c4R#lHsH;3A7kJq)lp zDquWRWH2~fX?jEr8{N;J@shGJ-`Nh8{E37M-5r|?V9u-sge$1&=O#g|al)(aMg27( zt5FLH5l>$ZB@GOvGza7fOP6S>8qVF!+|A;3V{0tEtgH(4 zDZ;XSiSqJlrTe1kcm33`Ic9UJ+t+6&Nl?42v9;{porKhPMc*!D-`piHpGLU+&?Ry( zGdUuOJLZnvnET9ym+#YSniglJ7w^dba2b<^1pr|n;yvo=EQ1&kH4G)!`_XP@m_bAs z_&-$o>4MU}YS}!rHeOgsjQ>@8y6+Z4DLgJdo*mvT4%1i%lTyp27l-dN-#(#zh@i^I z{k~~-cdf5LA${(n4t*HZGO5vl0J$8%N?@c~4cBhw+t@mA5835ANp2d#=kCi+a^D2t zKw&a)#h9J*?Q#c&r`P?{;?EyZp5JT{k@X-(1OXc~e(K%+Ks9)Q_%RRaeb|@~PqlXn z6ilmUS~QDroPnH|i>8DPeG(~rN>JzSD}K^cap}C@&pS^=#PF-*iRmQxR%09& z09iB5Q|-`3>W>=c9c*l!NK99JfXehX2aZcXGL4pjsh6`K;<=1(Y^EsA7%~cPACgtV{&+x; zE2FZ;{DpuPtcANp@Ce4+w|&~M9e@1r%nN0_Xe5`I0E~w>0Ye^abYyxg35_dBM{UcPA9l63_R@u3svLfGJ*38U)f?9Qd}aE|3J~% zyvSdeWJ;%@sxu_ea_jc?FY zfyl}qOfQXQvJbA7cg+_B$)es)02h+~z9RHLFV43E9VGxLJyNgVBS#DP?6c0VD!2+m zCG(b#*7F%!!PuPhe_c!M9|DMIKR6S#z-+M!(Vh?uA2qou+zpY7b!VlW?_sC|+%+D#}5QVGOsn7{4_%|ho3 zTn=;MwjZ-59$exWK$J@%HbN5#4H{a(<2sGJZit2c5&6C+7ja+zeUmUtyAUwzY}5jG zqC|C?6~eGg79uc_1t+4)9RNF~HFozw;4H5V$^*K$C2X%(x#DNKlp_k z<_A!L>-ceprsY>c%)gh4>D-A1+;suQ9suL!4x`~+$THXuH>QdDXkkscvAV3V?*44~ zXD=vGog7*6_wV1^@(UVQ23bYYYn>IV-lT`%iSWkypgM)d3qhtdr(?1N#RES?3`G6s zV?9+^0dnR7?W8!O(rrQI1pR?spnxkiClO=|tg)sn`IgO!AWJPF%=kFJ;uQ)}c`54S zgNm5*ISh_2uyNNd1YfyPuQV4Q-^bi;V3%OKz8%WbXt7`6qN^kCyA7p_sxt*u_X=_q zAi#ewQ!uW#YR#*3P<{inqJD`8L~5;n9Rb;KT3wzkN)f)|1uY!fZ@(oU{Ov3;7F0RY zpOm|x8|#!xI)qa4lcoG?gH*o!1Df_oE!)(ejQy!TN=r;@qauZsfzW^&74ID?XIR$+ zctFy#7h%iLJTw;6qOR$UM%|G!if5`RYyCdGr&xp(@qeL_KkAVQG@~GvfkoG_f}_Adfvs7QkzON)6++++jS3=&&{vapU!x=Va|bzgFVv`z>p1q~QTQ&zIkq2d5d)%YwD4zOLKwk3 zj_@j8;*BH1yr)Ge7wWdF5*qb76lZ8bzX0G*05DmPm_>nPl}Cu##RAp#m)lX#6)Gm^Lyw#dycP3ouWc_i1$H$J(8q{X zaUczcW<`19o*PQcCojS#*?d+xlEC^JGzTHK3D58X2Ask9@~Rl&8n8N*glAyD&@@9w zr~~ajeHq&q$Ylx!wk&_~>gt7y5JA_&B2PpaM4O||Dc~7QpwcGZlr? 
z0fuF};`iY&B1Pvc3?@w#snKj&l*)at0G}-fNKd^Drfu^YLZ4Xm=3Z=ku@oNO5QviA z@NUPDa@2JSg`mIJG7+^nBtRi;o!V3+daZJQqlYF&osj=c ze=PmstD09+pv=mN8kUr_OK@`o@Xi(HDgc3?4Ej9o^TYz5CJnN^%P=m;gN{oIeXY9y zR5QQr7?EaHF(Su3(6OBVmWdKi09EBDePuUCA<6C|MFUVFgbm9$6sW{k%BV$!cPFd{ zU;ID>(cz1(W5oV8CYb0@$yuRi1Cs_90eY?n90#e2vmj*jfdF~2 zI6@4xBygH&(;5^iV#h2T&`xJ}Kp@9%HpKwr-@_`H(`w)`Z!%wHRn_pZ0&-X8<;7V6 zL&gMT&1BDk8#FQD?5+$cM%8v;tyx9DtIq_19*+braS|_1jo~2-PR!;_%@;cCaOKJs z_NOIywh4ElWxNMHmgca7@&1Cqz#S$U@&u>;jopoBS3q137i|4j`?471Yw%D`^=JSV z&%|on*?r?EOM!z8vl~QYOhjv-`S9p>)&&a%p$p7F24kqqu?*v5v{&N!)=uO97<=z{ zs^9;ATqm+BJ85vTw^Aw%PDxf~_D)nNp%M)`N=3>hl2P`Ij6!JGMHz_(nWbSRl<|FB z=bYy;dc8lt-#@S0dDZE8uIF_m}3~Dj1E`v z@xef;59pG`1=9vs#cM$lI9Me~Vft57&=z}7Xvst*OHUQ(LLjvjSA?-okVL{qcq4Q>qkh!7wVj^B zH>~!lsj0<>r7ZGp1J#JoFB?9)t`w?%`lax0QZrSv6MT^`r8p_~5HEq=BbUQ<{(CIe z^LU+RT#ed~8s8sx0`9uCi$TEr@LqxLF3L0F_~9Ai_j=aM7p{Mwr=L>}ZW9*(+N)T5vLvYJH}LEDLT=dL*h+3}+x<*nF`EWU+tc-}fr8^r zitc@Ji|31bRp;A7pA8Qnad`|d&-FHFgWlOE-(|Y5V}NZ$nhrdXt1GH#I~{5pB7tYX zy&i_Uy&q1`Wp4_lK@7VG*gr(u5%&y-qFR82BxiLvWmentzEYd-|_{Tj_`(yh{_5Y(Dc84yB#=xxeK=??g1?-kE6SpmNGN_7b!$53})IPS`u*uRxa!$|KkAljj7RM7T@M02cYRG z<=N;XY)LvV;rV_Z<1LoGOZx zGV03kL;2-Uzh7V2;dx%&7Wr1sS8NNU{)3b_u;fg8ANv!O=@5}2qJ26E4htogNd0m; z3oH50mqqL%IulVNHZcDlhT=j;TJvBS(}nKMrk?YBWfQH}^uhf1}O$WQYQ)BHa=iNTFT^<0Q@OQ*)w5QF1)08UPt-}>$$ywR>U zbma1GU=dH=4p6lFplwlK2zYpIF$Vn7=rB3c;WceOdYmzOTn-77@w);0z`?tT+Cdvv zAN`Lvo7+DEo*AbmVBW|&{tAIFQGw?$Ag)ZVi;~N86P6%ZmU!AU)%o5@Jy)=os<4XL zJVK}u-A<5YU*xd}L$@Y)eom7jSO0b2q1_K9{Vd$tgdl4hAacdK*`5#iXw zXN%<~izZdq^lEMYe1)l@cke$YiTIsCP^3%_aEV)wzYAo74|%AD=k`pUoMb9hA#9G0 zpeH)8ZwIJFH?+TMWx4^?K^24tkTy<;3Z9qMEq}`_7_jdykRdsEAHF3ooTPQPXpE!y zU(YPAlGdsCM*&b|>NrdTI;jF6Quo2RmJ5INjI?Xk8pv<)4R>Tbei^1KnxM=9+(|tJ z4T!Z7Cix~hP3fjWV&m}GvfMR`b=alyt<39A zzuq$p-AsvnJc!f~N)+#Rl`)c3HQ@{j4W;eBv0lH9cm_*0WaZ`MO>IHIA6G2-1>u|# zu46o<84@hpmMvc$Xf(#~PcB2$jfjY0NBdr#l0{Ur$YHk;(NjbV>Is5-xT{@KmszP3 zp^AD!xXAu5x^Kh_TySC>UBFbNS}_O*f{2!^>VhfV{Bu_lvJeOsi+Y|kTS~uW6<0!D z10`gYzur5BPP{~1F+8Ab;^8m;xE$8?utXW|0O1zUL z)l7g|F1pAb5i~+fcfyDjHvc(XTm z4$OcZIH7_L%B;L79X4`-hK7C3$yz}cC3XeYZkXG4_;({}oe0j0Dra16_w`2Y zx$PEX(fusqPTHr!JIdq95T)BLpy-U^YVPYH?o`y2^k!3R@Z3H8Z$P+gY2 zt(RcKimGpjk_dGGBYzMu1yv}m2lafZt0OI>NQw^fgIP4&4Aka}XNoitnCTx*alE(_ z0JWNp2x>jxcWC{5AtPOI#i`FM-H>IN+;IxJ9`8YIhs1IT8QmD8dJZJ0d46;5UB%5h z5o0Av^>|L!S%rJ~eOxRZrDmPL59C-MQV2kTGItnl@&2n+mf;T~Y=W6SC(}-OI8whnr!laf(DyD!9idRVSAP{c^;2YWeIV8BJs5*sD)>}? 
zmo`ng6LgC|Fz$$;2IuTn(B75^MG_QUPT~$FS)e}H)E&2n7AdG1jB~%v$) zjE57_+)p*xe9LFK5V)$AZ`&A9WG3jiLDdlvB_rOa^0~+Tv-}Z)H53MP_4L@GilUha z{=L!R*o(hwE{pHOEUm2(L@{pT`(<%KQ-o$Pxu}^_>n0GdRUu~`{vc@%Pk`BZOt$fX zF2tU=%g_svr1ukmQNv&c>~=kIr<@+~J90B-c-4%1NXgecVE0Ao{AhOQ*17?zKC3z` z6f2@&J3jXk)ukj-HWXNie?hZqprfwVa|t_EWs!yPlhWO51$l}4tUo{VL44GRqEagm zSYls5p_F-X)pOoY;Gg4QxRV2z&)un4C{ZW>^BQH$Kkm5(Vl(wlM2ztTfTR%fpuEr; zF!rA|76K6M|LWt>Hdv~uQQ(SToXE19z$+ik2W;=)E6C-l3+3mAdNT2F~1 zQ-HE?{HyeKV20%%iNxO_F?9e5QNlAJdR~-nyAz*?#TGr0$kxB#=)x$dxI7=uF}U)P zdAFZL*7eWNuU*EzHFtK@t*#C%uMQIE1`DGhGyugF1ZL$?7ss&Dz!44GG5AT%@&!_- zAu36IYf#%fxR8}_omLHFK-UI(DH56yhgh^=BoA4EIGDf%5P z>b=?a@$^Bh>i`6+L=3a2SS4TlB zV|k6bM-W#UBtUuzH@(FAaU@U72zg>fr(8KEk_oP|A0S{^w|1M;RC@wgd(@?d5Hs@s zhc>k(kP16ct*9&sU2&)n!vc7W@_TG?7y)II%8WClX^o$4AIH){9JAAUsodE&q- z)lUIxycKsGCCv&X;fMkTZrPI2j5@twW74AEEm*3k77;N_!Q>IW3M&}+!~0w}T87bJ zu`kvlx)E525b?I2gd2=fhyfqR;ed@8=~+4#(SsY>)#vYls5lX{vNUS0)qjO@S3mCQ0qk%(E*A4;?qfT1z{XG4VtgR=b{)` z%_b(nfR70LoAwLT=oZL)nfMDj{1XT0KQ4!Gm>m@ONr$_D@TmmQyJ7D)GxFtZ4h4aV z3N9q&7%;i;BXnL}>vSEQ-$ZegWav4)eXbmW>CiY=I`A{O0e_(p8Wxr)3@$)%cWc?q znO^B*WNdEW2Dmvq?aOtkf&w3Oj*-GItO9UMD4kl~!PEs5gbZKl4T}Yp6V3XWPc}r;{}4jNrW~jyc;AUqkGZd3*c(85=~6|XQyF1(BUBf za_aKMRToiP*khlPVh7BIqcbGzr(FreY^cU0uIXy?0a3&MQx`!&lUbZf0576k)n`La zEuvd0yAeXp?duc3kkAF1?~Pkc8v~%P?cf9Z#p}yqjtE2ZrE<+}A34Ff*3A5_n=1E! zCmi#yFhOF{!^=-*LPZTVHR)djMR@~NA8bF85DiOk9pLNrj}hXk=};vEah%)y{@zAv zZHID#pZUH?>W)GE=wg66IQ&NilC@FYLCBEQK5(DjvDIHtZ<}h&@esQji45fI9gf2(VuG!BxXf+Fp9@ z@j<)9$LJL&_8f1|6H7$IhTCBXo!CN4NAzY@)!MHe6%?$*fSMEDfwE)1C#?((gK)*! zSQMw6rv{A5YO^~VN&8AX!3yZE74A+#kPoF41H6thLIb)V-1nYSzPVI|ZmZLYNO{2% z?)zp7Jq@SULDAaAO!~@Gn}46gR@5n!B~xS#=nL?L_k`5i>CY%1}$ONEL7) zZMs32n@#ip(F=O%0^lTe9Cg7a(|{w(^w8aa`t;`O1XtWJWEbAguY~Of@&pIL8_ElX zfndOh{7wW*OiaQm1Zm-NSn?$93pi0+(;75{P0VPV2BMI!5lD5t+1S{KhMx&c<^@wj zEkc|uE%bh_M}srgySTnX$ofV&y2zLe@al+ZZ19$HSx1Bd zw=IlAB11TKB4+K9R27}M!=_#|@&|$@6Cw%;UP`}}ucqb;3=Hz&XmoTvl|j+DoF$;r zygO{@*r9O1%q{JY8g3ytK4p0U=}VNM&K7Z{`=5?CTzQm22CbtlCRlbO5>!>tH&G16 zual1~>D$)tw9ul1Si2#~5bXh8+FGFVK=c^Bn!CNJ_Y~7MR5RnS*BYuLsCy|PTx;ep z08t$IgXOT3qGq3IXYvWqA%LicPzNWFM~k{GI!Mf1Bf-Bz8nA?B`?>;f zQw>mL!Bk`sfeS%_ZLFI(s#S;(OYVzRxLCUR&-b>6-QeTCs%o_T$FsoJ#GoIEnrcwI zm^yhLx|LAxOIW6(4JKX0SxETTOAu*@(Kv*&MU$|`o&dsGF$=PLpru#JV_*GOU;twm zjk4+hJP55pQ5Cf1+fk!)oeCr=tKa>i9Wj_+z3^Z}ohrznp%l*rDK)46`3ruB3n>z-Ad|EL+^Mg8En+}03quzq$UvN`uYDAZ(pOrWlvzm z)JIgkqMri~CNJFrh4Q+lrho1#mh_&#I6~Ed)Gfw7h%0r12YwT*Xi|~`^o)StzcwuA ziab^JXM8Rl*)D@dQe*$*KRz74Uyd@Mh?(k@tTl}J0tche#jTKI!V3o&Dg=by2v8&= zN%B+5kH99E2>!=|=KaQln>ZC-4_h>wS*{LE$A$*ww_EFfy5VM3;xaNMK`%$3COQOz zLs>LLN*fv)il>Cp{ClBbu_%i;|LP$*F3=^7*Maz~S(15`|wm_-*gGrl`u4f~7KDsn0( zV}YHb(rdcas{)YVC_~hz3~;32rcN}cyxAO$rJjP~YDTEs0dfgu1l8UZ24lX@KM7Gh zX-tf;R`hBP;RDwpenz8r&`7*ZP{P5b>50w%HeTld4)ueeI&y`6u4%eXCu&p&s9NUK z?vPiAIp!V&eFl$;Qc21U;Q8Qa1Y8Ubo`37!GfayRK;wQ(C4I$0oE2?a>~ z>AwRw%uY{eLp)l>z=&yF>N^>T0(|h=h{~{&0yt`%;CQA!3)3W4F4N$Q+kbjTvRfIV&sE%ios7rQwCc@EmU|4V+ zs;`to85Es*s$`9TLa@q3cYO!p0=0$T0=U95+AK803DBJfB?ck4h6X#WQMFYgJuOfZ z^nKe0?$r4ksdE|!t(u}I-Rs6un!#rwjmfyTS@)kB2c8Y^c{c&vkhBp))?swWv&bn< z1TQ{}5m5!9!O%#deuoh&1_if#shi5A(h00(0TA^hG3@~79LPChM5Ff_`~x#ji7W4_ zxN#r2GUT(N7bN0v+4S%9&I8h9)8&J1$k4d453R#Kgc}c<;vX%84pt;tC^T48bBNRn zi!LK^iX9bgCd>G?@u7qj^v#my^;yxW{ZmAHx1ZSas}RbZTjlCN`Eg{!BqfqMK<5i! 
z2j~=VR5VbEtE3C=D|~vHz6xM9W0{hu=raWkM(1z#b5P?p{QLQ32fFoS=!NKeI@Nz5 zaVfS$gvtW2)4_*-rp<(u?vSVwnf8Ko?Kys*pTSni8W8CuHWI`WxcN1NYP^tCa^Lz) zo$7i)s|uQdi^kgMmm=~RG@q^QgEv)qV~P(F^{5Jnrla)dp%DsTZSLnrY#|SKuXaQ( z_mhjyII3gH)kN-cqDJF-YpDC)g2H_jsz*HOWT!sVcE2@*KcwMWBOsDT3RE%DbnHII zMbQQy961R^*k%Wq7j+$>38M5UwcMTn@X4myB-9vte98e}z(v2#nSh8o8;B(wBo=y*xnUS|-S;x)g0~$wavnwz<<$G#A`(~pj{)F=ElqSm$q)UgFcA`25wSDB zXZ{X)rGH0LbHSZ6IKb&Zt|)RLl+qz_Zg%{GL>$VZqk%;IC+39@1*c70sV1D0SU$d6 z{0vkPU7pf$a}vGTsW+IcIfypgs-FZrfW{Oxf(LD>5(+o~J8~Qim9Nmc?@87+2ce+P z4JA(l;Rk@T`2xNqs$cc5f$}Tj%R%6)FuRSLF#ic=K9Z_?;EX3Rkz7&8&{lUm6Z|K! zd_1unkX4*0oeN1k4~e7BF45X;eym6xLE$cgTyuJHQ z1g#v4^+GhIj3UvjAS+AvQ|~b8gM)r2j8+!&!f8Rbx5Jie71X_$i3LTS4JQdsFsf^S z5=VxmK0mlgp&m4;c*7y=-B2sUI|z*r|NT)I0Guinp!inTe0{c{Ak4s;VowCJw=K}H z_3Mm(!iFqJXZP|Apn2I2J7UJLA6AA+b=I*$%g?lFMx8+Ydjnb&d(EL}Q?%)5ee}px z`MG>d`8GPh+oRr6dd>v~1s^Cz71YRJ7MCZ0Jc+Co$9MwbZbS8htH~)njIe_F*N^wx zP;>HTWhm^}40lza1o+%WE^>KHR-w_tXzEVt^_tW^sB6`^o~9Co(*YbQE3VMUg*I0R zg&ZRS&t_qCVY0Uzppwvt@Gmfpv|z6W1|eJYa1i!)2RdOyyZ_gwqxyOvE2oh@_vH%Z| z1#olZ+K(^Bp)>*F5$;_ABdXRT21l@GE8A8`@=(uw@rTY>92)yk&oTjZN10OCC4`5U zz|MuZ1_k*_L*YU&9~;`V4j@yS3PGPJnv%;j4F+dFpjt+gS5wQa|5<4N zM^N-j%b{CyJtzq+%@4qrY7EX{ugtOi*gg=YV7m*I&87c$kO&(dXG{kaPeml6h+aOp zN1FimW@ATj)u2qc8Dv8CA(!`m9S&v`fa8-0y=c;CqGtmCtu=Duh(bbAB?6-&(BXuq z_}Sdyngfw7R6?X(FE3z1v~Gx5p?M3+Du9)@HA_D06~y#1T7I)>!Ew^8N4+I>Bux%D z062P&@FV|6)el~fK++JV}3;+D9bL59j@$+x1&TU-HwTECYijk5# zv??V{oa^@16Q+o_D}O(bZrdPkZxY_RgN(P1V@2UpS5}%Yfye}7{z|6ky}=2LFuFbE z9C=}2Ktyp;jcAx5Nxf2o>}dLS7pM~Ti{}B;ET$+V?a}n^pSx*m(g{<6{D-@%lKI`B zh6IBnV|u@knIVWT5lra&TZa9Yzfsp-LNpjZrXw zMU7V=c$b##=q7*@0a8oAPgN-usU+$pgz%)4@d5-By&`2oR@?Y^pWHcFEJjLm+M*7k za5tq@z>0e?4lpx_aB}qT?(@I>dsdT&5~3%;$Y0Oj4`UGrQ@Hq_ZDo_1Q((Jfq?flW z_B%aD2fH0mOa20PI8QGmEji+-Sz@^YpGEu0FQi8r4T&eu+bva z(IWl0hux35d}Tpr(DEvc69tn)Db4)n*hCkN1?fpG2i~+%G(+>?*8ouCw|9BC#uy5b z=UJq9fNaZ^(X)dM+e1V|AwXSs$`VBP2O1`IABJ0SMZdD6{@kQADqP$p=ar;7G(xc? 
zC$^WO1R``rsVck#`iUa_GpO_hds0A?Zb)WnOyPweZ0SA@berO8SYop$IuqQ$9QWa_ zci_rO*vmCh569Lb4r=SeFsw<6YZOI@DA8{cAVc<<=Z+9sbD@HLv**c;S1Q!%6$Q*| z{S5&*@7^f}H);c^xfPYY(YUjET4a;Wa_%FbXzzlf(USV6o7(b( z<1IBcO8HH3ac5|l2vrUkI_Y5 zk4Q;AwFpt8=<;yp3otG&D4I3;TMj>(Bn&+TCnpnr&b=LFclCn^BY!hdb01&_LQM~$ z5$=Ex7dLIoBdr{mXy8^>DWnGzo-zuGJU;{Ni~TaYLu^FIt*By;isoAA0#{8;HhA$O zcEf?S$#>mM1q0S^T6il!9!=QLR1MPH>RMthX0%AmZY$Va5aEnX9kU(9r0HjsXAQ;9x(k ze#((Rz`u(Vq4g#Kq>_Cn01W{Fl7paY2EOw#a>zb$Mxp*9M87UycMghlh-*eov3?T5m@Io z-!>M+lq1C?#=WBrzFOGx#1DW5f6Wp_R=9!;I_0>ekl~j4zTm%#ACiM{D9{-U$s5G- zG%4`F^=3~*Yz@mQYQRAFjFp9--`8V4+hDNB2tjTc=#fzh8q#R=T_CtB<^W!C0ZY>F zDDO)45Dp-yXR{lF(j^M!p>qz~`Is7r(8@{;p?o;x$ku}G^MlDpMN?i8;o(PiJaneMEcy^=L4sBTw9w|- zdbjHND3VekC=xJx(?Eng0nWi;5OK+rC{AkdqT@o!odWHv-tbxq78tu5w53X&p$D~m zK_@CBAQ~cPTDg8l=xiH+lTiQX2kJLvQRn&PJN$DLxc}u}$w@o_52Fe)9Fa^YHH*;o zudY6%BFP9^pp-3FFL*ahp;ML6#8qnkUlYs!yHlepThvOmY@708pL!@a)oUaqmgILs#xea~%K_{kP(>X3V5@5lEi@>rnsiSN=((UXA+n-c z)=tM7W)`E0X6S5v4du&e2z@(t&g>=)(Vr2pLlr193T3Bh!(3>H-vn-XL{47U|2TWd zfQdOHwq(d>-8St47x@OH>x_CE`=s&2gm_u|j1RE>`(Cp%yRK+|qC z=^70Z1f+()5uTTH$kiv&o<8wvT!Uf8Ac?+;o4f0J9+|@9K^%VjH-5`A-8SYAJ1zb@zB3X zPFKKm0KjW@#Hbwr)D!d6&$DGWpvjcKKLa(nqai1#>0I5i zLp3j9>Q%qAMbQhm9wv7;Wl*cGBu&S&3QhL>4)!MX3e>}HffJhojcZq%ZiI6Yp)^+k zF{z$QJy86RhGnoH1os@2z+B?z5_IeZAN;&hacT(Ruj|bK1hWd9xpB3-e%?jcgrmSs z#GVKqL~)N`*}+1ZF1*^oIV(68z{cbR9{`Qv>TOQusAK{K0B+QO=@Fdp*dF{SxzO6& zNkNPngh$?EjV=s}8BJYhLJqm-dMSXPVghKP?nATqFrZ9&qj$*5I{~Lb*prM&swPWQ z>#S0oaVY^0HMPa*ac}r`Ai_-(^Rc6T1APE53{}KNk{2!c=UR85``JDg&B31zgGOuN{h~hxKlZD%7*4GPNczy(wLFkSB;DK=A zhUXC_US{DZQ(AU)Og(dk6Gl{nQK)FpjF|EvMK<(&r20I9#Rw_AAcHgUfFAF(`vtl6$ZM2dvJ6Kns$k`YJG*sRYc_iQ8?NRE0zS z06^CH>6S6%yhWOd{jJWWf;W_rt?Pz_ zN6oyd0A+iL%jf53DCsi1lD#tdK@}TdPq+Uz30rU)L)t{(y&QD;X)vx*H(k7m`lHwt zIG$gAlIwM~!$(BJ<<<4N1pINeG zebX-+YITfaYk~SDs_T>ac8O1>zuYerxq8490ue}hhvwfNA$^5-4{(DpSwxK2s=5F zY!H?(yu3IzeS71#SJ@Jezc+{6tErzAy;K{z*tOR$EdO8>;LrH*>cuDD2ep2%YM#xq z`nU;`{R>K{{dOVwF{cl3Hr}y$;ibTrt}vTc@`-nSrAqeFK{3Dfe=^njS@7L6@ZC?` z=?WcVv4flQu3OU2nDt8AkfGHCXp08H+!VQXflTB1=#qAi&sP2N7z|Z8nb%#r+NE7h zxA=GE^>>_X`4@US&veex&+rb7Un--Y+TA(0#EWI(e0j&x8D7z6W%Py_DO|-r`ug`O za-W?1i))J6DWBBvZMI>Yr{D%#@)`g3{16WczA#Q0e!eB4G@q)g(K_NxX$?J#w(g&C z`PAw-_P0&*jeW=0mfQm6g^Rp{JJomGPl?5XTgvRM-*c91u50TJcdk?{aoge){i;OI zPNnm{_H5assQ|C3%D>B{?l+YxR%;Q|$VbEmI5ml`S4PxO42hSYvh{~wt%qm6@r6bd zxBkqod|bsM5#HFcumsSt!FhUYZI?Im+vL*ol{{y<>UVx~6&SjU>rS#7Dr+m|7!vjS zHQX&@5~M88zJzTLtM1_~YfP>LUh<}A^}S@i=hhdNeVq-t(zW;8X*Q&M^T{oC$1^$L z_*r${9i!R9s;WvW?_ZfBiQE5zhH)9)zUQ;4KjqIIIVZb$L)E$t&zOrJHhyd|h|M}z z+%%e`KEmXbUvl)ABu7a5eIFVe1O5^X&Xb=Y8y#!TkVd5Z0qIW}@H)G3K7TcAwJN2} zK65Nn5D&^cYuHdD%C&@+l|V!N58j$~*@{1!va&PIse{d_{i+TQA1~R{b7h(kpG~*c zMf)c&ALOwS%ssR3m7({;4e)#L_)BI>fv|3t5U{X z&psRF*HhV7KV-G;3%&o+E*h6h+X?IND_QBUFMYA3i@;Dt)R30n(BYNsx$X1^+06}M zpPsb5be{TDJ=l;n$Z|x+WB+eXWyiZo2lY>>X?fWQPOKc;Klk?NQwl%v z`Lzbsheyhy!lAX$qcJ5&-P=Z~To*4=mMm#{zU&8I@=gD+BcI0(?WO!nMRq(-eSq{b zgFQ6FP7YCN>2R1QUkg4?0)VoD!@`8l3=A=!9uXRSVd)KLlR!biU`@g zk{sk|SGBqDo{+NR%s$eh@Cmf4SD91mG!L^8;G)53O=+m(1YM^8u{i*0QwI_GS5(Hb ze(KNdHNZyNd|o9Nu1#G$@g6G3JiEwP8|3~ zA&~X1hQvzW>zD@*HkuSV-9^1CqU~w|RB`m;5|!S)A(h$nM9bQ@hi|=5&K_$W(34pp zuQbFps~}ZsW%vAQheiF4qxG@9*eI%U!6Y|k8-(`oSh{8d%o8&01GTarA< zLV6{Erp9@LN1Oc?90TZKKy2<4dJ%a-Gipl^BooxYBSZspeq>!Q$u18*Sf~` zj;!7Y*-c+_0`ucOFB14$xZYm=6>HXoaER1DdVT!;Tb&)JCpc^wwlgHyO7JjKyvwOpMBx0fj8{O6bbVM0>xgv9X-d zrLLvwan7@6WJLrjhGI7C=QmTK)n5xLhx9t}f&^J}QHvApel$2a@dqb66dnWDM`B)H z^RE(U4TylKFLnR77d6-^5p`H^Ke4^`U2I2^T>jza{f~Ek?m*YP*=lO^e_ZsGDq^dZ zqS*My@f&s7)fPq30(Z7;>l;_`^m23e+IU_q?Ymf!={K3X_o6gh_v(w5SjOKZ?=)=& 
zoZ6D}UsJZ@7-+o_!KUDb4SVXNUG6$qw0M1OOq7MTEy6J-${hf?#qGt5%*IKYnjK63 z_A7`3u6)1}-LvAp*uj#e6S7iJ%`peAy3o2BVa+s|21PH+SNh%$X|E8pW~6eD{ctz4 zpj`T;_kQZLleLZx7ib;*>|HRG4P1rb*%iWFOxzE)?^H}#&;2CuX7?q3`C%O2Bt zQp;7VyGKOFl2=%4v|m-tPU3@9 z;Rk*I6K>3W53F6%FxfW{Fq0D&A5&S$rwzAJ6&KldUP$6>tYPoGC*ZHyIXDEAm6elD3^XMNpxVZIfR@g9`6xAW^;jVU2~4_#v} zlVEmdVBhPkUL)B}_WP^v&@9|Ie5d%(%gtrgp<;H@e9Ko49j&QmxVC(WCSJ<1q=)On zF^`7V#=^q+D$6MQh1++@_7Tlv9eobiFC|oq41&V!ovnQ^0W3&cVRE0w?ZZk`r95OA z=j`SCvgV!LPU~mo$%46K+`@FmM`fLjw#I(j9MgGNnlKyZwNc?Z%L|pR`;j!{t$L*B z?8T7x_jx%{#cpyWM7qWEciVnh57(u2WgHz!@}=BA%l7x#?s9N%U=;|;6e#i1eHuei zcRJ~y%Ui@1MO0{rI-IL_xj%sB%a?gy0kIGP%G39c%vwyiE?qi{dl>OOaJO<*-rb$b zC&GKK7>OJAHGUk?nkt^m)qd1zQ?PE;w72J;GUqaL{_e}ePbaJ|kR1ZAfTf#{@N3-^RWM=w}X}~g8Fvv2} zOT!4TMD_{jpb*-+)o^5VRP)X>L`$`(L0?@kkLvrRtX5UfzWSyet2g4sQ#G@Jt>C&5zyks4lZNsj>5?b+9 zqB4pCVxxuXYHeaCs;;~n7uuw-bSoxy)YB)FA0>{AY*dR>)KIUDGXAtVFX+WYfkMRd zCD>^dqtnV#JPN19x(YCtVSpYp;s8dHAV_oRK)c*6)>ZLev3I?65>p3mI{TjA50-TZ zzT%TA};@&HRu!yUzEPQbnbwtpM(O#PECBh}9 zz~$U=+_xO44k&4pXhZs*>yd}c60NPXZt=|I{hG*lNZmH3*S zYNd+~>faI+DLG{Pw$b5`MUqeKnI*e^*?s)N@{7K5>F_0Y)m=TF<5s@nN}l)20u_Hux`t6Ve(avlTe5{m?qxX#|!k)qiR?T5= zw^D9TU+;B&~7F~PY-yR zb}Ow_^FO)v9BbZclFB9GQT$LXE&6KD+{J=V-UgUY9L4=+C?&!F+<|A$nj3`&yx!V5 zO~~+~4Os9YOWtQC{Rr<3OaI#a!M{kd;nCYZk69c;wvNU>2>^b{NA zjcgSDaUj%>=e|wL%3w(KO+7f4&8YC3aW8W`WzUUW>^WU}pYVA`H8Ny)8#w2yeUGo# zy?Hd@80AqZYUsrE?wndADie}eM7cBTpnePpu@wKAeLep1lAe{9XRP^H{oz2miNn@h zcwSbP>+~;{b`hlsy99aJS$wbkcHe%!ro&GAO>g-4@>fFg8*2N{a;?J_G$R3y_^jnx zhpVu4?ws4t{YpV48s724^w|61b?972SZeR(|2FAN3OEp!^!eJ-eEH_J3pN1h7Y7t7 zJ_K{{((p3H9GGUXB~#)|V%jP^Pga_;7z_ME0LB24L>)Q^duv2uL!tdW`)YlyJtn}! zD2{n%;70;uPP)*hnafe>G)FO0hpmo=7{?|m!x9oY*_4oQz=~aNL%~hK%EA>ZCx0zY_d;`G05Eu@cI3`?9R>wbCo}Tfl zepzaf>w4bw)8^)oP~E#ik{S0r>wyrE{E7_w&`lUl(>)5ZV?u-iUJcZG1R#^0b$JgY z+iYPJ9y>2DZ|>GD>oCdFh9J*rqNlswk!x< zP4)Qq$;Q+skH*BhN!QOl{rat6MO(G$Rz_?M<6-9AOz{FeP0dgcM$%v~Q&;vw+ZPuU z5&?h#M(BjYM1{zhdu3a7F#ma0ZUWCOQjP=m{!{R6go~SM&ACVHx_+eZ_#D) zHtcM*s_UH^JzJEQZC6(`Qo(Jc(Ov1q)LziEQH7Y7aVXg6^;1w5IR_3nu&`0?3vDU{ zJbTU^vR?%OPjWc`loDZ!?j5Lm9Jac_zlQ_1SJk-nggkxEKSoPbED+m7C zi!Qu;gcgS5?WfJ1ic()EK1kAR2ySsd}fiJWsr0ndZT)x*VgGuR>Yj)k|@i*l&7ccw{b^%{c zmg4K3Dae-Ij>W@M#t>a7mTuQa;fAxw#v?CR!=U7tuT}Gc`6Xmm4DKdQiyRO-&ZbX) z6^IQ=`mBP>TCY#G0~FC=CB*%vTgEbD7%nB@l3LxQT^M|g&q?<$SHiLu>A;W<-Bp?E zkMC^A3(9GfEb+{bjo`pc{tw2$D)gmFR0~YLGx0 z{#m!~I%dV1^X9DBFQLyIHmw>qQ&CY_%lfGjM!72NK*e{qnz*#=ab6#k7Zc8p@88p? z9`#u(u4LB%y*@=jA-m6@+D-^=DwVRPoEvKoGR>28NgfdCmEfRa83MzgEP+o&bFDm- z=sci$*49oa%(X^ObNN;Tg_jy=HuX+? 
zoQK&PZ(~dijp>v!5G-Q#a9b3NEQ_O(d`KpZBK_vEqImNmtV z*rwrD9m7A(?5Cw$EGT~{@=)8sZzEf#jI5HD(f?;^_)_wAlrCMfr8=BLq}IYS`Qwwn zGWWvO6Zz;)Dn>oA4TXeJGw9C8f+7#jY2}xF_)JTFyNMvspPr{Tr7(eFa^8vqsC&et zN89Bu{>ii71m(UAn6x>0T=c|#Hr2J0nb5q~HlU?vx1&RVej-84>Y28%MXL-a5c zYeIK_K*8+iuzanL#n<2%t0t7VQkCQwLL;uJ)lQXZsF@swtA`Rs1#9Z}1yNq8vjOzO zDc{@2c?@x7;sHb954FOct}F-}A)6v%AXCX|@J+WX}Z7k7uC=AE}I zGM4F@Wz#n1{!z1hnoqv>dgrrm9TvglK>=*A_h3{bKjja|iljW7j%hriI@;f&V29S~ zM>_mwUPq0SDlNt`>%@j1Kc-Rm<9ZN9%aJso4)zC`b_i zk2&P)*Sr}zs>UN{gaXp7C?LJ{{~eH82uW=F^q&E#9f;Q6$$=)f1)>#q6$?n;w_P2p z4u15d!ST!YXl#*lmdslXN-d9G8au~+-mR4ILb;G3Nw{$p zbyD$(?YBiOO?N*v~UN7R$VsUAL05^@5FS?rsyd;#K5@?{`#W!?;RlR}!VZ`$A z-O8DeCPM{|P?>D8cia@ez}p;!)n(;d7CH(kn#Aa-%elu+#Ah-!sg#OoKG$%3s2=@q z=!w$@_jV14?X}F`6tP{>z6K`jlJ9NH2FM4TNzoIA|7GpO$&?Ive*l^tSD@9PbJs{w zpxo}}q&c5d%f3bR`m2x^6VdG%?EQ;srLc)KT;!;87V3~IAO}XOL(-gB(Sn7$1E9pS zifqNuD>4v>Kx|eGuAD8V`iRGR4_e)Ct-( zo3XD-3eU=MdTV!l!i2kA$?EO38G@JfE{O53y-@l||AEB=OH`qZRr#0)esl3d+g`VM z^#~V5FZZ>3*HdIRt^mBK^dA!Cljuhk*^oL1+D2`liDx#usc&O$Y8n;W9O!^K_IO6l zh?u};y2l0LEZ=gpy6%-NPY~8IS|rg>b>SdBPrPYY>0P{HkcVGLy2vNC`4;w+ON!t? z_G=qmE<%sZ7X(gH2Q_u|;8?Gjw!~$CMG6D1gl4^0+odSyK%0h@9n+xf$O6R$<+!ar z=N9G@LAem5m=o+i3VYB6QOw3)Q;1M8f%gK-Rd1EOd<;3-4mzDcBNPPWJhx+VpjRjt z*ws(8>0_BGLit)OhCI>AxYS79_S@hS`Ou4VSDpHa9=@1qtF_;XVjqa?lvU%cp{TiT zJ;_lga6o%uHjMbG)koYBxu^QI7+F|Um@zkaNEEwH@3Rj#w*p3B2V1HD|IrDVmA{r& zF8S}|+5{&gBfpBA_xmNJyHi;%qUVZez@YHEC6tHmHG(_z`pNG+lreB--R?Q{XrGEo z9gMm;V0Z5KcQN3U*Gs`>xC}boJ#l)Ioh1hKr;fkF&M*ZtKlcB&I8qpdQI;oVHkb$Kxznzk&X0GCe;7%X2 zIB9wVF1b6WsK<|U4lm}nAh2`@dhaT|W5tME(HBY;K&g8@a|L&u^zOejecnV&@M&8vd zhD*ATS3uZ>=lz0vq0|*WKD=@BD80$UK6A&i#@9v}N9FJ<4IEth?9=8g;l65rmb*~vNS8!FKz1<>%fqg)-orrwlzshr zDTr;8aL|+=Vr1zdwC@b?UXl-EHPx;2irTnQgWrsgmT~c79Jr5l%#^{c(v29EG<&PL zY~}HTKl=M=4P$SBJ~xN}qh;HD-6qxt!=(zMTr= z!MuKbgLT_oy3X3awhjU~Rq1nS=~9iTt-B*Sp5unbU$<#EMeJ671@?ke<1a7W9N$Ix zHK2!wf>nw9QIxa6spEx&lp)u3$h4mtX!}|1TRL!0&xHIsX*yVmU&7^2b#LRAk_@H> zg6?98z71S?6ZcOK#`KOHN;mAUBQCvsh?xLpRl@d7ySv#dti1m~NFO_M2MyS$_Tc^! 
zcou<%5xT1mLKB3eX*bLmuLd@pr73dySP3yroiN>dv34pG3JmHJvK!nA1uF+L(z?X3 zggGWfL?$|*oIm9;@%e=G-{osgr-Glm`P>p=9ZO3Uy*`$};M`HF6?LR*!NQMrlr%2L zrs+po9Y=tcSp9qV?6t^ZluYGofBn-8(!LS3Kxw*!=J?T?SI;=S3o-BAkP_u2=RM|>l@%13D$mhNJSo*pGUJi^>N z^f^SjEaqhU>1#j)_F-i3#T~-+ZJ-Ql{pqFwo`&qMvS``4_#c=^p?ZQiW7Q#akfKa4 z@9s20>C?IhnFye3zsA4h1m@S=zznXZL+gkLR0PQg&17bQ`xiBjpLK!6;tou}gk~meEUhi+8s; zx4HA4Ugl0p_{z%Qj#=N#^&HExBC6ktZ>ld^9y})C`S$3Ar%K#$im&-rvp`*A!*Jl= zCF4q1iH?|dp4YhObhcpR)FDG4Q?+|vFcZ#`n*Q^Gf5*VuGM~S90J>)edqaZmm6s{T zFQKeTXD7ODw{{-IQaa>Yg)_AN9lRcLuD1u&2jL_{Lkey!_Mc)yDR#EEQ=@wKyoQW?T^j5b4q;)O;32 zLmrsJ=_Kcc3%&3Z^BuOLHIcAiNm0%eb9t$ZIa{!a{ZV(6v>Q60x&z zyE@yo2l>1UtM2H3Th`JSoY*P2ey8-QNuLtk`=@%}<@}x4El2RwZg^R0n(LZfK(*qx z{rIT%2xJi6zGS`1LZHtE0Uh&md-b?`+S_+#RJr-yn31+^*H)rz845#@L7Y;7X04AS zg=H7J^x{7~d9Qlp2;Xd~sGN%g)Fz29jb9_vgX->hs-lR_2b+9UQxy|Ea*Rh90_T-W z)$-|v%=jOsykIl4tai|r;iRZJd$Q_;EEk)=+v;aNKGFt}n*!96^~GE?S` znSoihTa#C2oB5FE^yi{YIVV7-|rt1O%5D9JR5+M zC*)7XK*|smPw-|W({rY7odtt)Bz|tv&(-@MV zuBW_0Cr8UdHShTCySX7h^)@G;SI(|6{gRa4v3Cyp;~q_1ku=p8J=?Kfh>dN`Tx(#e zB*$aa`fFovS5&IGRBU2ygK3&fp?#BT(HY0oiVJ77QwDmME&W{DrO~15a%lR`(h;Z- zXmpfDL|S?rKESv0x2eZ)#T}B0dmQ2Xr%uM};P3D9eYl)+XS}|)9n^kcGin7J`U*k; zZZkTO<^Hgtw;FP0%+8Z)spzK6NP~qav)V9lZGorRoLl}~Xs4}q(2S5$a>~-+=m!@2 z^!sXglwM89hwi$yP9CNjdKurWaUT0>wPZ&r(9Y#an*Gjv88sFOo%@Hf??tG4-`;fp zq4_y`fx4L&94EVdW0sG{e{VN9u6nh&{Ls|#Hdmb@H;a3112xWt8X>}!2_fa9($X^q z-ww-^m6!=@|HL0mUUSrIIN-Owm8`k{dU)RSmNJO&KR8JV{9Xyr)?xgmWv%5Y`dK@0@6a3m(@#sR#+5U%CuglxY za%-IH@BY2Dv%U6Zq>JnH*sGFKg_x2LBF`rSvOQ$YH6r@!fag%SDfwhrVTW|!uhZkh zExUNKq#M%nCrY3x&r8*|=g!FVc$J0CIs32M?LQb#yR0b7KDVYJe{%A>kcFMLE(fI` z*4r|;@;inpWpEcD{H6~V!>|Cd(c9F&UtX~DS5nA$r?;iQUj7RD2Czt(4QYEGX5#*E z`-}7ZEEZS(+VV|L~Q-q6Sp+EJAiZ+A$5it=!+EaiuIz^`tA@@`1$J4-?&M^?tF=TlxZ##j zgokxK13uwi@8Ikud#a?6X5f9;~M=77N&uO}z1lCa(-GEx|v9)E&%ca*!+mpJ0O+*emsZME@wH(EV>}gX=_jS^tR8}Rs{fK%?$HKV9v zPUH-@by=Oj`r7k&QNfIlRh4&&lxtzj8m9~_$7UN2Ho%Zv9j;*cC zS(H_ho@>%~*;+ZGvwd3F{Lz!VcXc4zpG@B9hKo3rBK#3hq)Cj|&g53U^jRZOOoDJ^ zn?Y87e_x`KcK^(6`WW~+>ildX&pDhgQ8 zJ*ko1MS24B3mL(%vBm=qz8gIllnSR4Q_3cjVDSB~Xipay>5g#}=(U((mBJxk4I*8| zZ~ply;XHTH5d%YuS{VY3QvvOQ@6H;}r-8RMQCMqqbKhaEfLCf~_?Gf%tDZS|d2X4Z zsk^q>8_gwiXO}AXIC+_fV&?@1$Nxn8i%I#1KC}O@_)og{y}x?0Uh*yiIQ*ada^X8& z9H4Do+nn%Wnu-8HIu&h$d~AU3VdW^SW`A0+Npq7Rb=~ zW?Wi z<0#u}EkSG%IXT|mmURZE{}gZQpM+XbMNFkZF2C@r)8nbLJQOM}bn9_n|KnfDg7}(% z>vREp6r;}XByYsEt9SRo6D?pAw(jw)P*mn@tCUE0@aTCAN3s&UK4xQ+X;r#5ssQyI z=9FSRP+nr7^zF3D8rZpc__QM68P{N+;nUoQt)GDcri`-JfL1Ip1^gEJ-r_$gWST_u zv!8qGuf{-iAA0V~k8ar${gXF0yRUj@=m6|)4;P?QmL`*Z_-9ehJo00pbqe7Q!WdrM zE!V+nnj*xD=HFfzI?pxUGuhPX+-TdD$7h6i!lIMzEwFd+$%-BfV1jeoYw;bj1kIW3@j{W z+xB%QlX}O%%5)f1A+0+t)l5XvGF&=a2_dT<^i{8D957khi3Lv%8`)rB^MOx40reor zdyT8kxd(j4rD8#L?u$+D+uO+_ZDAg&2m8(33?Vc&k`r1Bz!*xiXv?DuBgG&|JtN%R zN~glR^l)n2F|JD)E$am0sj+eVLE2`gKm$nUBN8ETZ?!cnxE6TASf&r&c4ZyA^z|=b zWNJvz082Z%;n@3I*Y-X?J8)<8dd%RE7$NmwoE1QYGi}-;NY;Ds^DH8u4uCdtZ?ms` zbFBrUx_vrO!^ufDJ%U{`bWVAtZ!eQ%Xlxn=0hx}T*I>H!jQ}5mQRvpdpe~|@s`T)m zG$4pV5t#itWpFk|jq$x<>j-J!ow#a?4M))w9yGNn0sC;v__b%L5#C4zC6>HGGWh9T+KHZ0QvS`Q_Yz4%i1-Y@De1s>vWsIWY~dQMNjxwk%&-!PgBQryJP`V0hJj`u1h8Y}uchlfk9 zsV8ogyD?TrzXeTXJ zX-?&_W@onCqlDZ_%hg&suOh($vP?!-^K=2d#-}INdpfr)!Fa~=waFlwLVW35j+V0K z$+E$m(~?_N56G-o`rhJz@|MeJ$zhvQ@}0j0dc_YajkAXK_mJuFB zvwkFGL)aMTtRSS{du@NE6+5(DANAvv=~otMpAzBHEK8}v_ToA8axZHm^kvnEeTeg7 ze|EjB`_$MDdhiTT=%U8XrAbhT8Q&^eqc2hI97A0R#N?cub(b$6ljcW_xh$|Pos5$N zL%$_~hT0iFCwbEzf*!Pp1X_Wo3(ifN*WJmGkzPmv@q^wpR~fMvh~<6&Z6McXb9}f^ zTq2-(F0l_oD@)0}IYz5QCHBtu&=vrnHTHwN1wL$lBZNKvwPkvqlBtPjw 
z$(VyNT6ShgcqwIXwCju*K#qQvE>*hU=8kOUpj~%f0bqd%LZP_l~OUX7Vns+WCy3ERcCOu3hqXFT1%=eHuS$_?GC)*VIkDVn`b;oyw>`@!muc@1iO4X>X2eJ*_uBITImX?o z#?vCa_QZcmH$Im~W+HGdeZMYK6sIiH2D0zn&ydLo33LF8+ViWQ>gzRaMwS|Vtz~=- zMM>p4D3I*whO`#{r6l=4s#p@jK5g|{gpjJAX%tuF%&Su*h`zp&TWxKDN(r+(ORo-$ z3OYj~e=Yj*p~^aRa|LZH8BbbcCTt`ajHowSK+ zb}-f$GmHZ$S!P0^>uo*&b4hbNC9z+Ldk`?rm7e1B!%WcLQqcZO#=vVAza(wPinmEL z+W|^BFi1WJSA}gEP&pIN2Cd>l{p(y9I;5qSb#-PGHXFD&EY{*D5j&X8kLQ(PJA7!G zMJ}5%!Ky-M0DIwB;ZYE zBdZwj&Vi~wyq6nn_h&~3KBSfF^ri5${Ut4pqELTQ$m)u#XT!!`x4laa(l)_}=_+AT zQ{4n_)>N^Jx(Vo}-uXb0F z1lqNP*raDawu76;FK~0~1q9T;PEXHTV*=o}ULb&4gS&m{X9rXJkm}wsmto4C335r& zq?BQMWOY8;KkMPSA=ml6`-_A?qbUEJ-0r3U>PZ%$FbTMSebm((Rx`jfv)ixqVkpvT zBM-vvg>LqEmnalV+wH!PzP$4On0gPOMePIH9VJ54^2y`p)qsM0`jHwaE$hAKeD?ti z4YF3I7w2c0=y^fu=Er?2Zld|7h%4eiVR3n`3l+3`4k9X1CL*rbmzuR_AZpFmP?|@T zsj#P0n+R{Ao-JAVMgtB7R1ayhfgxambD2+dQ?1$o>e5I{jokOWTVgcz*TKGPA~r7X zG!6$e01+eDYJKx_ zCL|rv1B1-XTJ2<536J9WNx(uXX4}Byjw}x{%oCK{T3o#$l}d%5-sTIh>Uo^e^Wg=; z9*WIY^^cRTr7oBHj6>Dy#nwWp&jvNa&^LDKXqI zqFGP5;L+L%y;R{Xv24;|y`+*qh^sfXL~)Qj#{il!JanzeE`v7b?ODNb8;7E69}1?f z^#TA`^?GX#vE)mezEV#gEB`}8Awx(D2fWxflzmaLuL@e!;{h-%Nb%(O_Hwo-!lEPP zZUP~YPrKK>Cxv4pSTA;j(_%oi*#dbDj89os8y?0O9-#s9Z?g9A{<3C|$=95xx*0O8* zd^`gg?iVLL+q>dmpXvGn-G;K7d!1!ZQsq?WbaY*c%ZR?j{1l{7?K>U9RevYse0B~i zQU4BNZ()^ks;SfU=v*6s1&YJK_xA>w{*}W$d5tBJC2PhArWCiLlXo1?>G`&uqp@nZ z0z;&IaGF=pRDAre@(*tnwO_VC9GJ3abAxe@^>VPC=ayKN74fk`Ku+GD+R+YkW1hj9 zfe(VtTm$B6>40rk5vyb@z$ZEI!Q#V>|Baym5P?X$MUrIkkGAv0pHg261f=EzFq6`9 zas&bE^I_=E<=uCPdh*qOvMS5e=bzeAF5*kmJvx>TTigOJK$8ATRKSAX#nE*Luf6x8 zujrE6c3iv=>q&bCGD&ON;^>*y-q}WE1XI@m?%vY6`>00x+GpTF9ZpdssNT!g@5e4H zgto4K^P$G=h$B~I+N129Y&m)w6>Fb13YDG_vJ{zBiqw<;a7!Nb0G;MnrZ3DtKW@Fb zn9b}DWV7z|Dhy6u$$+diu4FL_wqEf1E$l5dP|gf?U;OLvQM&~{Yl|-?ew!>TBVZ+@ z*79E3KQ3OW{DnZ4aAE(Od}u)3pxE{bx*<#KY9_{BG`y&2vPjgj+z>&off}3kH8Se} ziogEdB*JT#z7zHBM^DG5kRZmdI};GLu(1sP3fTbwf$fy3C`k%>!yE}`ZfDm!%hOVv zE7o0{!$pm6){@h2mf=Dmjo*w$i?Y2Y&H&dFzB4`Q1hx@9sVxlT(iv|&zuGY7naNag z{BXKpKR3gb=o)3FC&jF5f|(uH0;+`9XM}9r0xed__=7igLDtWulRY=S$O1_W2DVxc zAevemI>FJ5a-9@$Htke!sh)gJ7N2)v zyRXFFEz#o12-X6YQLfqOJrxX$(A8`w3JS7%M~V;EyC^1P%ZIV)Lh{i5V-&5*jT@nU zIZ&v9r5TI;O*<1fsF1Lp?PuGCQq$B;hkidlL$Osdrkj&RWrIK*Z z&n&5#U!YB<(ZbpNYh!7lmSro5jurA<_Qu~?qQ28N*?A>3Nci2onFIY|*TKGe*L?i` z!WCP=Jq`^3 zf@0GIIU{;|@O0B?`bCt)1EC#Rj3DFl`@Z70pWm(KRdL=HnOtR|v+34m_UCC^@2uJ3 z%Y7?xxBi{5>FW0q`#)0uHT9i5^>ae^p!)kZBPsjhhg{0)x9=0*YuB!o{0U&-UhnUR zALk;uYCrt1!SjT2w!42?(z=PsJAd}HF<^XdFZXn z=791&3>>#*6zSEZufRqBZTT=)RV|{YGV)1{;PE9gyS#< zgN81w9!=3FmJAjsKJadQuX3`7uTM&hf!$dREAG$E_>0$}EG3LkbFel6mS~S^6DO0I z=W5|sI;5j?V$qT_RS_QJD`S(@J*LJQ#ZVENr(U|B1|)*x$fsd5FkK1(VfsmvO|WHo zx^4~*ub;R(F=N>`;X{i_U__9J+wh7bSYC%uFj^@U#Hz1XmsZdxv&YlHsEtim-1i}Zz~_2{ikG+xrHXttxuSf8W`hcc zXPFAhsv_;zqJKNde5oIxkSOkf08VxBok{da`u)Olm)F}G+O%(~gnZ*|Vy*2g6eclj z2*dUO0=q%PzR3Xvez-oeMt-9%%`&b zNcMdgGMtEk?=4KYe0t~BzS%M^L8JR#I;cp}*nZI3zN@@dwh|)mAYf6GSLJ9YHdY(a z$tjTa@-Ti_-^HV7mb|xX7IO<`V8x1ywPz|S%|JTT7E0}ij1MO@>TA6xA;x0Xv#(B- zreFXKh?4fx?6z=?_fD14w}@bgT=R&mr+HQ*~o!Nv(H8PNGVM#qh`I4SH z9)aT@KT6iumO>u{GJ>WL#=v17kpcNW<1V4qG0n|L>+rxe$ntV<@3&dlq6CTK6`pq^ zuA#HGKzxDW8!~-85wFGhL%PXU<(`0qAy|kh}I>V0G)Cw!fKmrUPe6p)m>*|LG5nWZlmGY>i}{`8Z#qH`H<#K>#SKl$dUMF_` z&5kmnRKjQEf5kayibx$@0yMe~Y%3Vw#YXy8MV)v?a`O;#xY??_S#iP&J zMwxw*XhBJWKbm;xRU#I=P`+VhzKs?UuRNlr;j1sYQYLFkw?=GlsW-64z~`kQJV^HL zSZbTRCnTk$B&Z==4+ZmQET|`;AQ5|pVg|bpro;U+#;4yuLn(=1o3xNwW6nc=B7&}^ zouU)e4~xic=-b`|Qp8Z(>-dk1rs;KJKVps5U{kW{6UiD%zoqQltq3R&W1=!Gs4vmR zew%@tBcKM`baO=%Y6HF;<8@d%#%qyP$#-tOWgTDVw?N8@VPEv{OPzie&>&OO$VA 
z-57RV9BNRL_g@s3_o5t~z<)h{etceuw3U&Tn*ub7wDmxvslQu%_`1D3CDo(PSiTC|%`aAJURA0LWTu*)kd}PSacH~vwq*C{Fv$r05Ex`wFpua2^ zyeC7FE+;o7D%loqa5ml_)5h?f7^umh;w!YKBk`J&S!#HG9~fC{j|Vn1%OtN>{y`|aujveQZm{Wq!?11I z-yZc{CEpsT$GVS3*bc+}tc5{-nHqW0c8wFWsy&>lhe@Rbx_bSs``6p;5nGW2~K%l^q9t7 zdVbLAm3qFv69$dfzSJu3pklvV>k7ji9Lt)S9bw8l-Oj0|w8M4u?ds4^<4X9L7~_~@Gnm^9CZ)AyvvwyF`Wrk%7fjb;KQm^`!wDLSG*@C5f4l+MWwNJAxL86JQLn1b$Z#ZE^BIW4BXJOyTLpqys?qdU7&kd3hlNWCNPmNdsT_( z!Y4Q5Q6-!&?k?r858O%Q$I#1qM?o0!E)jZ=()wkqe8DaaRrerUpA*Vlw#5|P*4Wr^ zvi(x1;rsE;Ur+~}jVU^KU6&FK#S%1STZhcl8(IcJTY(%zaHi@ObYio7Em0v+KqJ)i zmFN>f#PeEw7Tpz7D#u5#S0w~qzvQHZL)YQbEGlv1=j<%5^lS|ZxJKRZNk(Vq?G`}w z(LMxA#*wpK^7Mtz>6CMxZG5bXBVj2|^V}*P#p-)kBo2glneRbY1A`q3@#NiLO(Rj` z3cf$aRJjCIN<{b6CbDw-Mgzys8?E#_&U@6R7QH10Kh%rCj;ceP5O-@X$$-k4euk~f z%A5=5%_nlYp6RP##B-am)1<~Z_f`uq(nN750Exj8nouf9XXh0D#=TzuW0}~K_4_g| zx~y0!`n0V#h{r#1cx)JjSVlqTK9c4VSm5m{(@fXjAnTIwx`jW1Wl8fatf}5mEZO7F z?*CF*YUNy2JlrEiDAyVa`I?W}T#sCGk1h|}bxN9YYc{W_otX}raM@%`1gl9++1u~P z7G^S^ou_vRjx}A_C2w0Np1uUgsQ>Dbe9+Sgl;lpCZ=7@VR6;0vp#M(Ce-40|@)q9X zP6F_QCvSQFKg5zfrJQrc{>MrAKMNoLK$wJF@t5BwrxTSNKYPb`+MD`we>S*%?d{-7 zVahDleN{l6aTI8sBJSSsXs7et)=c4r^bd2Elka)giySI1&X$-4m^)>_EKQz_vq$zk zp6g8Z8#QEnyn*OUU7)xEvs3Go7n6~El?;VTqm`tE^vrq!+Cm3%<%V`0n#=8ghmUNY zy=OG=(62`{0b#@FYWn`@_q@6c`+B$#+*{`Yfgv^M#A=(FEvP@vdwKb=^X@mAXS5ot^?>tWd4kmFSukh4 zeWdCX`Q7`hzZdMahEk7YbSuvY`cB{*Jjt^5EreiqeC<&MSMS&EquZjf1I*UUn2k5F zoej!)zJAxQj1-M+ZuTKz>J^zH^dlm3 z@_xAbUN#Q6G&AHqP%VWCoM%VvrBV24FuyV*r+e9BK*N6N>b(ezNIQZSCCTRrGnTuJ=?%U4MwQQYeT8~$Ekt3ubu@6Y z&kSi27S=py0}Aj?-MT2pIIrkoVPg{+^YWcIs-+UYTsd$i3Y3D1~#x7K37b0u|%S95p30AK5Qf6qHDVdIMzzr(QZ@IFuf z&x3O-0f78~3nz0&N3?0fck@r>eCoEz?fP z`IhELzE0b`1(=_)$By|38RhH_i15k0msiLsbOPpbdPWX1nVC$#B#9=2fk$5kf|vnK z>8}E}zaIAr*0BDwvu?#q$hz@y2GDoNn=ed%;~%L(Cu_;qZ*Dss)lU_p_64?0!x(RY z*v4__FTT^}Q)6;4)w)C>TYa79`bNGpK3E^wI?;NJ+whQlST>x8F*y^lJKK0l7}uYE zD=klRNWLY-H`9D)RVABaXJzG?a*8xn2?sB>%HrO5f!qXceC78lZ|F=IQ`}f78vJJy z8&tmiy!$<6#T};C;n)szWcY!!(l^_ymFe2sHwk^cMmEsx(ep9%=Tc#;T*UT~Gc>=@ znuSQZMXiXablRX6$@S}YJsvX!@Z8bTF@(Gocl*T$T1lGE1_iuXXq*A{sI7bs8x)cD zczzm;bcD89)MEF%1 z(HXcO(mbdp1BR_b?tTxknYx_Jd2;N_{BmhnchgdZ@0x1jU@ogXUZ7Mqjk+KHz@AOX zhFj<6qDqwqb;c3fuE{|uuP;8Dm`3YekKV0I^w!%Rl;2!N!GZZ zb)$B06B;DrlL)Z(XXb0W#(VIb;z8(=cME;IZ$2-K;<1GLb|>xmQsXkZvbBa?nAiWh zNc1oqt=eLtJJF_!qLeo7@=7PK038XpXdsZXRrL66#H<57n^WANxY3J%$g88(oe-8q zqEYj&htDX`EQlwTIMw3L<<2UIEzC+U>g8z2IrJ6{{$O~^f_ zI{bkwR)?!L-wm{ksPhmsRG{p^PAL%|TJbF2qyQ2nu4J_o*szh}~Gn&<-WjRon zGI9)Vzs@S#!i)|MG$fvS1{>E7&dz*Ms^V;!-p~c4Rt5u2SAE+@1=iLuY~QbkYsR68 zfXL)Uj(=k4heBm`C&|~NDyei+(hQ60QfYxxjdAG^3&h=~P=BNa%YrCoyqMSNsvGzX zo5Vk8lIbK-QRp9$R|cPglH2h`nAzZOu%KZrcF&~@%bI7*q1=vC7eL0S7IRVK3MdHRC99a*4$p6- zFyFnY6(awdbN~Yw0N2qgHOl?LU+qc|Qnz$K&@GU?&H$b0hY9SP;s0+8IDGyQ*ZoKP znlmQ=!rkQWUm>QEUtgaJZdvAqF=VgeRXvO~E*lJ-D)nI+TpD+7lx#JK+FHA)7^URB zZ^U7*3tBgInXImo+)!sG1Kbq?H9_J&&g|?8y-i1=ZJ>Nwq5uZT(%hT*BK_?~1-hZ1!mGEoo^bk5y_Ut@p{a+yd3t zCl(K6+oNLqJ$<=Vbpj+Ga{c6g{Jk<)g zCiQ6jmk&TGmsH*kUh;s>3gUC;URjfIf`!a>0Uyr{<(WwnN7Ed&q^tyg#>(_L-r$(N zI_&heKxFLk3Ju(N7qGiUZesT<))a7)*^;nSWm0{?hSl)a6Ilcip4cmBX2Q<7wU7+jM-62E2o zU(b6aul%G5YySR&v16yPCtWamO^**_|DitSRa(cr_;p@Z%U_DKvQiWsfhJBW}2fH zV)XSR#?mM3nz#J{brfoXoYt9?l;(;we6=+QMtc&IrEPHvBJG%WBJHX&{>$E~BF5fK zDKQ+TJ+*YhAF({bb)!_W#_dX9`sq|!jmO&9W&m`_q%U?L3!?|8H=6=dGx?^?UMSY3 z8A`D8xgSMy){ENmChN07EK)t3dfZ7H<6=s8IXNRI(wFD%32a+q)5Z=SE&9SS z3;D_0tF3U;Sk}Vn1okayW2|$}A5GMXlNy~)2)@Q*%BPqZxpl2h2F>Xoj25^Pd{E~A zi6VN2A8=TFfE!`J=1QS;L7D!EHV%_#q-jfZ1mhv#mD%XjWdsix&1mVb7p0VxOhff_ z0$RQEs7qXZ@qDNjW%frEZ;4UOG1>LD;1^{J)&oju+e0{Z$2s8{8`cblq?F;BM>d}g 
zWL$`zlufPJvBeaZa;49c)AF@>J&wDXm<%{Ck2v}bC{i;~L2D2IM}6$fYnWW%Lhm6jtV-|> z4GdY8)Q;}FFC;BY*z!+lU?;I*OFQ+{AiyZ3Q|BjJ`=3!L)nK_jj6djSkMjTM;@_qm z0^-q906#S)F%f|M#K)gMaqWSNrBGAm=Vx~S zv&Eh9>d}mI7TNm0=z|2Rq_v8%s}pfW#>g99N6*s9S&CON^MCp%VRB~QIj-vmVR6K? zJe{vMCG&i|b{1R55$1OWRvYjxtS`W)$m(m`nf?qqi$&0jJI4F^Gle}4N}wjz7zt~& z0Jn1X{+Bpg&zNA~Kg)8|c^(XsE|(2q(Lto%;q#qqBhVRthHdMv3?Q#0dmLW-3{nvs*s_$PUGiJ?hpNl&?36yvVK z>&bU%SKK`_58WO(eO2OyqP*E*9bN6&9XMguK_>V~SaRQ#=8~|&T4>%9OLK59Z>|ZF z2ejRMdMYeBC`+kvg5_PoT!kRRr%Jo8mTP_I8!HE~^FErbv9bJ3Y(|62f`pp{DSxz8 zio{tq2Q0gh$nJ1^xxs|$%))ANY15Oe23JWEncGuAxNgTvh1L8>?M&N60#l<9Bj~fb z#mM1G6jZp^>8jR1YL^)g3a zSoT%DUL~v~Vi6B+$=-ZWsl2S40YrLEypzvaXq&Ee5})U={O4crj=LkamI}zwvM8VYt8(}FMxC4_ z9c?Rres2$$bh#z^;ke z?zI(fjfZ;=zLC?WYfyd7D=16fj}NfisS9NKUSXZfBC#Aa&L^nw9}CrYQ`D zUj}lel6~Vn#tl-RKS$eSnHsV7fG6nH%a^4ItB`v5%Kw3wej z<+lOBFPxoy#wHW>)iJDdu3(w56v zopVPBV@_lu%RaO%^fkPs22WrS@(6gmL1qDNzIG~)Q_PCCJfW`^XEWut*dIJkCn>|6 zT9@)pE|iznB>(;QnbWBJx;1+Uq_jG!F!uK`9${e_YiniwkVeOjL}pZ4a8<{2(AJ*T zfi;TzH6M0+Th4D#V?zuz3=75OsF(mViM^Pcj5iMP#aNO9C|!%QacFL)!RLUy+fm0v zGBdhOqzU)h9{Qx!K3ty;lLaI(t#`xN)SgtjeDGSN3upfTw)*wA5ak;ewiF|AClyG+Kc7dT5eP|BWtT+9KXCYMh(x*JYQ@qOz(A@m{Zh>+&;t|m^0Q&Y@8L7{e?v(9@f)_9gj)Sy13yO{pz@n!^VLY@e7ed;ys4 z)?o{NpdCr|V9Rbp<+f`tUWC;L<%KS)nUIb>OX4sMwnD2N7CcxX)>^yHxHD|}WeBkS2Z#;;yx>g6_ zpw$U~O_db9Am2>#RZM4?I)=J#>r)Kt_#Cy>w5*Zs5>9B0DCdrw3HL{PQnphCRUYdA zw?dl0iH}}`rS>fnS^FP4Q$(egcCdNKl2vJkRuk{Gz~Uavtpj5Koryn~8eZ9RU*G%O zzoDF=)#?&n+!CgD7mH(JC{Qu3w3pXOHDYdZj=RGD9ZH`S4qa&9>(3W^@ZV+jU#!V3 z-usYRz24r+`h&!nn}3$aF!gmjf3Gk8+u_}29=Z3iQx#4OXiTGi$8=jYZhmVA0~6xo zmXHb8i={T0FMzr^r4-4rV?k(OOee+;nxPEMPf8QX^v8tiU z;aYYY#v0lm(zmN-Xc4|mt?U_T@NwDzISqe4TAhFVd{lRzY?6;JceA1gE=|DO>XE0q zx;jShY_7xojVOjcn;Wa#OD7>Gq}ubfy8xrhOtSLy(8-Hk332k*3Pyv+;tbNWxN&cI z+O31~d9f}O4X*wl>k?ZhKgk8vJ(<4rw}bh+&NbC)%}wDHyS;X;T)Gs@cEHmcisJ?lJ7 z0jZFfcXa^4>5~;mN~jL;T3zp!8wJmVLmWc#Z+RGTpRU|89{@zaTU%W=xY0Ph=^K(! z9ueGc(i!CDKvd9l#Lm>O)&IDY>w17G#^G$CO5j@2raV@3(eUGR=$w)2`{A39)?lmq za=M;12v-81;?Xdav2;gKti2BoIGOi1f^FOY`#{W1&hD7li;wA}cdI`!DIPQzWtzQ zuDGf2i|_m!RW;4g!9k>!4MAn@#ww)%8-1zSj(A`udBk=i$Co_{eX(|CA| z(hOiz^&UR)(H=7ux?~GX`k5!*mP{Pxvb%DUN2Cd6we4}`74C{@Z*LgvKwPX2StO>! 
[binary patch data omitted]
zedrdbaIOn#^<`DEF?hqmPto4Frwn-86j>J6XVB%<&QmojdivAYC~0DRe5Mn;WW8TO zi3!fqMIy#?A;SLbS%sAb#&?rcdxG>jQ{;6#P+(!GcZ0)Fl>f%aSk#uJGMu)SaOZ4GV`cGz&c78xNHp`*OF1lqIqrsd^i+oZ~uRK#S(ffFJy#n7=y4zt{ zI|N_x3dm9zUlTeM$RRC-ddx}2U;0j$zEeNA@;TeV-;9V!FCcewl5(kgjj5(y062`??tPelvjEN`M3@XyzKOXEkCb|#!^5T&cwnUiWzdR zw^qW4L!4-ncVHhRrrY2k(di!X zV96$HG`7(5D7Yc*`qz%}AlwQ*D-CR$WYiwPd~v-JDQ`L&!I6#b&$U&%z#7idXbYs1 zj41BDJLp_`fr_jWH(Y#4#qV__$bxD0esjjfHVP?kjomm>?UTXOurb0F+E*%1`;?4! z#B(($JwqsMCdt@&2azOdr%Us%8*>Z4jw^8?1Ebjjb%%M+kJ;wrP#2*&e}xliPZqka zdLBWH)}ORRSmoyX&z;u;ZcknY32z;*GpN|aM&(Xa$)iTV3F}5l)3uzf)WTwUJ6UK# z)z!2bMNRpxsCl|0glN;PJB^)q(yV_o>9rRRq*O02;qh184#5~l9lw_gXp4B~eh9EW zp>O;WlcVCVoW9#N;lMqeW;qaj*rP6QoF0x~%+MTA^RvC@iV7IIH|Td+;NB`z1^f9d zvwnr{O033$n*hjzkT;`$EcXC0X&1kvIopC>SRCmN`rZAvzc>nhIsClu4?7Q(Ik_In zoC6sieAa(9C$^<68wG!!Ef>h){Z`C(~k`H=mRBI;A)CJDy=XzW{9>k_Z@+I;^UDE{ z$LS-C!|)y;eSP5*jx{D_9DZ2_S{XH6Y9Q_N!S2bDJ4Gvw( z(JLTa_4s=xjWVPS6sFf|TASx{#kb8+E5VpJ$x=JnFY`?~ehmgM^y^t-?uX{s<&wZA z$+?USqc!5&HD+|ZUR3tn!dp=p&GCWZ8VnZ;=7WhMZiJ9_XW2rqGz8RaOyK$enw+U<;i3~2buNeWZ4aRJuUd( z`(KD3ZbOQnE}IkO>cci)usUYYW)Y7Ww8>0&N+N~FCK$;ba8d{oDJpGANS9Y zOe_T5d6fEMCFqk-9zaJl75a>-rfCh7l!#`WV4~K=`d14zQqRDCy?QG0A=-^}sAH*9 z#+?2vhN}I+f?%(!NvAiQ8Q=9I+(Mhz_ChTfRo)ONj|_h!#iKh{Vu@%c^GkKDgL#T&#lXUteqxok5l9^jsq@+H`u5~losh(^ zbFBI<1ljZ6aY(|MT}#(^c|+M#A*ZILdMjeSX97pJ_2b(}rs|kjk25(aPL{uVuXE98 z`K&G<-ygfN>S>uC6kiCvF~TmD zQF?xw{hxUSVu=s^&Pr)ztbeMSNP)oUXtY|ZsppuKqMwD-J~1WbDVgOPJ>c(Wm1ct7 zSC}BTYFQEV{xNSQ`jc9oMe0y|*eBIKeo_)N!lSL2Yi2X7`zz{9#Cs?lqkF0-e8!bs8>6PA=TBSZTK=q>|_x|$H93Jk0SsWRs%-QvMIMT7C6i3Q| z@>`(Y^JAzo?{CHOLd?l{cOmJuAJs{?3?sT7t)D+x^9#S$LSwA_dIX@*zgG={v7NF@ z>8_}@l_J(jUQTjib?T*vKCND!0a_Ja%IRx;%yypH!8rzCO6aQJtw1i8mBZ6Y4ep&9 zs6K6?UIUE|igXfsc66JGdF}Krn!r$oCKj|2TO&*BryG&@W>VCM4SgMwfHN2Z0&7+p zut>cbQ2lA7_0SyHu*IGD$x%r=rG1*!R)mSy>V~!NY3Vg34%Hef&9Dw}Ln7g8KSfaa z%uKcCPnJ{rzNUefOfR>$IUL~_?}upAYEswjt5sPRskO87 zLc2a%kD&hfRTbx#kCB6eTr4ISvt3fDh|=BOpcA#I&1`apSJyN-iE`2X zfZ0(wF)K_c69iuPyvgQl@&x|w+;#KOr(*$ z;_>K4`lvOp^D}y1@))OskH3fbyHpBB-9kE*XrBGr3Aj8#K)hR!TEatt$?1Q(OCHut zt=afhGoED}CO>7Dt(X02_)3t4KJ@!JHVBLl^Sl0KMSx}zo^gHfbtjg=x+cpHQe*g5 z|4$hOGK9dqWuTd^3&hOA@aGd0&Ol(5M2^i|MgPD+0_jGL*^rU3@tbhj*Par%9wEDh z`j{28ODK~yNK<`PVof}PAz@G54CNHWFSrcJAk3Fth7*G{65pM;H`TT4B9k;= zR!lsRJ*BuJ984fIb6!LTIz7caymt)&#N|qsPjPZzORFCoXZLowiZ)u&Fs}&9cu_;q zC;E#px+6t8q8NwEC}O)_TtU(#>FJb%+OMH~uQp*P z+v{)lH^P506T+fHsyOPMrLv%=jVZ~_y&gu_=APjINVC%*X@@tmr_FsO^#l09fRi+t z+s8I{cAo0&i+O)5FJx{P{&?scG9*3!=>Gd^CP(wAUE+Ntrq_hOzy^UYzqIjIL?VLs zYfFtnjLkSw^jzxTH~F6w+KwSDS)&5rm9=ii>*ULI86A>k{*MEoJmjZT#o~lxnK5F` z?#Wn-1G-{aE^S8F)t);y$)6ZrdM8p~SP|KX%J>3{e#x2)=B|X`uO@s{&nSfwX ze}&u#0J!mqD?d0NZUw1C^`^rcZwW7s2#%4_@kdklB$0@!S*9m69(v6#DJbWB&Fm63 zG3I2d)wkF_zh7)3uq0x$j1qC8JuYzhRz}D6d#I;cQG2!4T!d}8F?Guqhd@OplxU@y zke%-{1tmV%>tfu!$Y`wRRTptZ-X4UslR3UV)1{+~-x^P_w+G*{^7BW9pF@Y86LCvR znL=7&4K}5pIQAsOWY&6Ro>{)JX>~g3haf~YsI>MHi7em+%aW>N>-vWvO#qz_`{@z2 zCHFs`$~C(@HD=Z~Bb)qahfTe(7jpM|`BwF(zVaKfUqhtFi1lx__^l?=Pet^$?jkmHjLv(9jUS~>S<|&bBuI!KbGtp?SL_2 zuG))Wd#G1tzslWwTT4eU!ee%h;Iyv0DJm;d^@5#`D@%Kn5_~0&xUBRe-h$Wo7HN*ahr|2Z&j4Him}BmffASWe?|ZSxp=BL zz;;%<{aK=ZM0+&yEGRhG*xsJ)ybTb%HDFfy_|(+(rn}-KI#8V61&Y(ROvBxl04hGID5_miwP&}tMFz>ICM5D0y?~W`%AE29$3l^wA1XSj0T6#$&pF; zcilB(k|~g{mLO%!grj!j_@M4V2$bVcXztRDOcc~hBl~10kLdMp2$%ROy4z(jPpqt) z;go1w@iDK!P+CeaFP)K#asu_I+%>#hir~UH88pHe%Su1vZPL%}K8s zFh&F8_S@*hyr1Ge6s>8Q7`bt;ZDzXW;wZP_XLoHy3;F8uS0H9BQ+oDAO3{EoKZ`Nw z#Rjv627{{=?UM=&9i4HTe94tz9UDvW;*6VGuaj@_^=Nt`o}9}Q_^+nP4}PvQzHK#S zJo_r3cujkIx;;)20Ka)E*}Oa8#x#U_wRBGD$i#{5)EB$HQ@-STBXv4c-}F2b{yP>B 
z-fT8kFZ{zJB&KQF5Twx?<*m(aG)CK~JX4XZjA+)dxHS!l9HsmBN>2qn)>a_l+*y}3 zMpB1ja_Mh{9WaJ_;}V<}XkB23@b_iQZBG$bhepx=#sb`njG-bS?z;8JFbZj?YbFto z0#!k$gGDzc$Ir3(Q7Vc`zafNl^!5g^9(faAf`N`eKENC$Bey1mGR;waZAgD&w~g7_Nwh!PcUr~h}d2TQ*@Ib4CMOE zH+j*Iew~+;w#~~r)4y7ayQ9sXb0PYy`JOx!wsr}<)ul5Kj95opcf2-#LMevrXJuI@ zdEN$|$nP`Sp~%!0!d5eBY^qlTijT9G@4WRlH~poZ*V;ip#0Vk?RR>Qi=X_oWZlgIlP3MLOdGcdjJ38VZGifHoTUZQ_Fht{$;d?Jc?>?X`3Pn8~!| zt+QRArhRe*`5ZCs6jx=rAcN_Ls48BHA#y@K2VI#sA_#{Hx=|t_9xD6Z^mrMnWcfTk zKXBR7l=_B&CNsx+^sbMI>wlWmIG#vHJXE~%7fh=EeDXRRDVmfYW2xRQk@QWbpj&)= z{G$^_=D+5C!RIOv`PWbwhe>jgNSa4m(Oz!)hL4YUB|ipG^cc9McLI^XUibMcquqNk z-TSD!wjjv|TQuMzMD=K{ynMTEP>1zfgTGs~;}aI8*o=@7>QkR}`KBxx^6`(Ojm{{vp{nDu2dO^rd5xnz5d zBU-UyhN*shem2a5{0_TpIW)qWmtRqKBbPu6o{@bXvpAJ)fg!9Z725x;Ko+Q3g!}=X zd#FIZh<$+7fkv4~asLEfw+8}rq6AhY%hEA24(-Q)z?s5cH~_lt4t)Zzob4f4Al-h| zY=9p>XWl)da9R_1tB2#22T=)}Vvz(Ul3iudTF%JFm1EY6-6g`PFONxwk%YjMHUBNk zMb2hlqvnd_6i1vF7uSX*^J!UvOy=45VtDmGBLibI2>BC;>@|^2b+!pMsH^7{u>upX zLNDU)KEyV{7war-P1J&9vd&*yEI~Tbio$9qkEr3*M*c1W{79_s_;o!ME?k{CePNAl z@=;LNCAUNpq;yD*VEeKZLDr@24`KY~GvS=vG~TAiTGy7<-!uOIE2)3Dl6e(YQlkI4 zlCR>{)=Z0aHeu(7O+2KNJ-`MFU}HwRXgtkq@JD97A@3uq3P+&H>d6zp4YDeDtOmdo zY{I$qERx;7u*Tnqe&BWrX}a|Cv}nh6|S=FQD7%H7}5Zr*Ihg+ukqfpXUn?HV;(=+L3R zd#h(A&{n=ZZH?tVO7xJyCh2OoC``@B;bjKKVN4KL`7_H5pOcaQt z_#sb${-t{U5HRT+3l}%IL~}u`D)M`Rd&rW}G`b+X^hh(W-DA{0`4syFvT$CHqYaSg zr8mA=YooGlx!=E1^*((*1y+G=0aLaUZ}NnuDk%-f1SA5A?S=7o7^dFp@NnMaLGR20 zoUbp;bx6=MR^q>SM^&WemI4c9@eldIpz70v5MFdB0=HsIi(^EEuCj$it)mBf^*w}E z)rkETn@?LBH!727XEJZ%!E(C?Fd<>?Ww+9t`Tp!gt_gs9 zW$H>}>1MFjCNw7Oeo1jl zZloW{PEs;ze(Jmu_58oXvOKEjTTpjqH1FKVYm!mvAHU<&6V|CCjMzXm8)qrPkzGAb zw2jX4j-@>R8=*{ZPhIu+Pbk&vE+ingi=6Jrr}t;b=8qo;Xi^$xW>gXqlJ=4)8c0PW z^KwTre3kbH`4*lO+39>%bg&Wma(gCx3)Hds8)}_m-51$O! zP3SDCX`GcpqX1KAmw;c#}s0u zh7;+KJ&Q3cF<-oPWuFYu7;A5D={v6*3-k`3OA*l3O2NpsTSe?6)wRxKJs7KQc1E!Y0Fl;tgH<9iuf6>!>kvug~- zUsGXvR<;9Xi~AS@2`02ecrgcKj!~+X_g)Gt`7luga=;@OAiYledM&3jJ099 zR*)^J*^03|mQj7kw4h)kdE?Cxs|f4nr{?1L9Lv?9rW_e}C58Rp+943r1LNl(jT7t+ zQoTmRcw8Lj;4PKA+=;N%v5-ekkf=U9da{W_MLeo^9z6T-%U^#%YApVcTjKc2-^2V@ zhnWuji18@q(tc@wQ}fsWZN!z&;lbJa#<;e6wnS@P>9Lh-)qApEElEc=abn7vjmtkH zFF=YK4F>etMZ(rc+UaQMU1v}v2rO&FGrnUY_+n@R|_5b zWpuw!v5Y%))Widh+@dRGh>~gZF3`sN1HXIyjk~ko-xX!!bk}EY1_F0}W}9?Ap_HZm z=Y&+N5>M#BP2(du#s4|-!{IBK9VImzI}}JDqI52uk7oT?BHSq7S$uZt1r2F0d%2|7q)!n6o8Y}x-2y;;AS&G`)!U%66 zmF*ZTmFjc zOgP_quqxYBJ4c8iW3<0~1h6AZWhE4)Z0@z&4qtqLcuaa)md{zi?;c~PUQV;2<($tK za$Y`Lv#T^1UotQrKf@A#^q(J$$nj`fbe?Qcf76y~Rp80H(yrUDxpWBE{=T7D z^$K{yvdZdoB>QKMGrL?vQC{oc+1*kNyQap1(}y#`X-!A9PLXGETC?Gu?>?!cxH!j3 z10R`0_#r}2mK%Z0?|JdLNm6~Yg6q@z3G55UWtEn%U+;XK1Hae^ks-5KST*K<+3|rk zZ0zD(8AA{W!GB3;yYwk{r_}`YnqPF-)`Gt@A$pVNlHz_ce zd`>41uQ7>^%SXE-NSGg6rHhP#=U|q1}B538T%I;qF*R0a4ARa;$|!Eg3GEaT&?N^e6;ch zli|M_`be+ZjJ#^R(dxCVOAKaYv;3wz!q4j)vo0(5Es$*c!j6q^^}28R9s7Y5l#xHb z5oURTboa`C6Mkdor>MRke%WwQW4-e3*0hDUTA->RZf%a2A({4H(yVd<(&aT=UXc5* zbd6&5>){_HbPRfL4Xi4S7`>C*O?gz0Na4vrks&~7aOvBirH;?m*ASUWF8IQ$2gTH< zV@hAc2bA_09Cy3GdhE?LSnP zO#FFAIo+k)V)hJYdT7{29@zB10`zSaI0*i7EG_oG6zkWdve9e|39!kbD(T|1#uD%N z_~aO%jU9$2)cFT5Ar4OM%NPA^P1{?Fzg!G?y}aHr`Q`;Qq$c;e2oqrrBiiK45yl`A z#*)p{GYT!3>PKC8CZVJ8K%0|Y(p-hNYf7YTzZ>2Rjgb^P_blzL{ETFNs9WC<6Er9)ccXxa5P_1ARy1yE z=Ym_t@jwrv8s*J*6fK@PX3&dCl4q)3NaCaIS(GpFGyTgL?E*S*ZI0xDPY;!*BUHt> zN%V^Sd_q_a4+LY_1CwLZ-Su)A-X=4CrXg9$FFu@R4J>sAc>YK#u&63hLEs)@tDn{U zK58d~Zy@pgK@e`5=+>8N!=>a6_(;nz{tiqygZv+@2E4}$5F5Qag2{hK_2@D8gV>y> z<}+nvRT8Hwqyv-zBdn*c2FZKUAE6Jza}lA4+bLetDd5a#qi>HuChJ@BirW7&hQTxE%0`{KIWZ z%tuqIV#PqpJx$Q9WM|gxb|bV#D@6CSfx&xR4S)3Ha4fluaw&N0uNSMtB8p|GydeaX 
z+tpoTwz;ytpzr)VqvI**NkEOHjBrDb)1CR?4!&qKVsEg>(30CGcvhBjE+3aM=B>8J zEDOTvf!*T0hocPms@3~7H{n-4w%h6TB>3VGFLzqP)NTEKr+eQa>cv|GBFgAEmMl_+ z7|QH`v3dqZ3xC(0pQ^Nw40HIP*Cg1vN8j6~E;iN%2Ll5FfvEzyQsTc_tcM8zUP4Q8lb z4y{7kaUz#W6)?EBby`~8QbANo1#zE)0ZGYbjNUawI|Ki^iR~Zr`oRVj@Ddi!<7WTS z;cToNepjyO#9UI^l?HX^>(DUD*)(1oy?R5yd0^cpH!er%d*A?$)DQVgdW8$bhPD57 z7Z_~vUDChPGAZ1zRCQWP!TtR-O;;6((QEv4ZGA$vVwsaxP4SIa$$UbI4Zi28`rOSqKjzAU)Dlyu6w*JIlq) zDfcPmiiTxz+I2fk+FsY*`(q>mn;)EPjg%>SDsA@IcuxP|(7<%S%RMM)oG1Q=Yrhpe zyE@x11$Gz-2(K(Tw3llec;y$dY36JV=;2rBzcy3ek<)C%X??7R_W0QeD~Ibgg&(ei zzC^K$pBU@TK+1LM@#x`v_F)=Nb^EmQx_45=TWUz#UO3=rH3o?*=w-OO?%O+9@8;1Z zRjBpgrx}>o+0JpRl=6M$uHY{k^V_){Zmsx!$I1(Fei!zc#B#JUP=>XutPTm2ZtqQ> zyYa%BZYr0@3qh#w*Y(Q~s85|+UuEyHsjS9a|DC2HiA6jLY4sefB51!jScx8X7 zNntso(F}o|mF5O81+cEk=JpD1J&_Zl81dj*lwoQ@^>&_5zI`Sv*P(`gQ2Y*H&;8^+ z_)dPF7c5%;W>G8~u`i}Bz+!KMKHuyLBlAH{xtxQ)U{>%_`V$E#6l(h#cMIMdwBiBG zPjPDh#X~!eQD3VrJoc2FY%B%&0;ZmR|$9lE0$drN?XNUz`}_M>Op=>bpa3h6n0H%a;jo(xK1j|Ht3tJJl!;*+rlBu+fUd~5S_}1c`hlsjwD(_()!lTw3;D-BJTM>V4n-P&H{C zNzB^QRmPIQM&mJKQVB^(d~)(~wR{DE@|eq3$Jr9ctn--H`eKk9icx<+ABjgG?+In* zFj}bLv+*BEI<#bWbDBo$4~Fq;fxReo6-5>l49xCquGKz0IEOW3xSL^h@4w2ggP||g z;p?ZEF>gIQG(LCDl`~hL(`YxneqCxwPiMfDc0Ed32fv`jzZiHN)m?MJB6i7C4?0Q- zqnMrmBi4lUV~XBMu8{qh3T|0tf0 z5u^-|lSF4{r-Hn^ljr=0$4`0OcBaz>^3~5qbnxvJ7y%cU0yx}<%V*D77c$@AvvmVL z2rq{rZRT=L9X`|V!G-TK%t3`Mg$nzNf=$wT6u|plcn89v*jMfMU%6Gtet1?hdF*JM z5;{S-?IYyhyzhIPi)iMnuzg*pys>Xr{OB(=27Dxn&|mxQzB7huiz#G`$zyIsz`%9u zD<0g_DXzNJHdXgC4yAkx)0J*F@xMgVlLHu(@W(qI{J)Rss2x~mGgqRV&L`mRusxaY zy;EGMv~*jh3Dr7{8V^qLBPYNbEapI}6k>bv5tS_#g_V;HMU>4Sl$dkIU4)XdL^}9) zq(zujqB(Q3G()^v|I~%#i){nS>Utv%(R%Q)an5Bz?fc?2*;e})xBrA)Q~tB=TK`%< zNWPz048xZWV`x z(jvtuDexGx|3+p{O5i)pC1(v=`%}C&7n_$cP%=zmuJ%?=ieV9o`unmhUqo za5eqrIn@0}-L-Pn)Te%t*1jWTGvH6Fa+C=td3L>dbIDpb=GJm>0i$4R1YMC0H*)m$ z4B*PvI*h68ui)mkHn(fnUEAzro{Nrh-sKhKzbu>a_~E&ohF!_Wj;;xlX&Xh!(j)xm ztdAb6BgLaxL@tTb{_BaVsjK7T^<)bK)Rpo1e(3;Ocr%gq~L9EE$RJe zt(RF`+lHbA`H1xypWZFlmm`J6t-rVEMO+69!`4$qbZd7Pw#xqq>{tgJZLVxs-+nEl z2*!%nm1#jK+s?gk zi&3UDwW9h*HXl9dVHM}nb3%3T|3@VU`l4nCd8oj5W~R2MW;eoXL|GK`b&{2Mr)XU? z8~Z5A%Fc!6_|;*Ss1IHbAyJiQLmS|-R?wPQgkdNcRCmo6T33E8fno0A1PMRfp$E?V z%AquUC_Ylr*lM%pclxepdgrl%u?RA=jr}-ycgm8>j>UrWLQQ0~UO41#`D) zArZ>?A#+x#Go7;P`>#KS)R+My1E(#6LW5+RGZx071$+A!u3A< zB^Kb2UgsF9&y;pK(-kxdJg~Jr*1QjswCZbowz)S>)YPxigM#+1d32j~%&e!4{$G37 z8P!zw?G-^329d5vvC)Enf=DkGL{y|mCxGlr4pP>xO&!#t(ZoTarFaar3oI26mijD5p|jJ3)!(QmC|QL zGrq|sWKo_wE=H_ZdqqDE)W`Pk_7_<9ULYM*dNNRI$$a&Re|-e_3YPo2aGPtiQ%U&p zHxTGhbHcFLb+n*@cAV50F1&?wEI@wBH2Ms6g9w{`v05P7t?@)+^4zwiO2_f`Q*Yi@ z`tGFN6u;q;+f^76S*n7MXHO^$zkUL|KX-l|q04@DvnlmV_PcL2)t_Po1m<~$er!7S zL|A`&e(7$l&Yf4N!LKe~T`r=&YJYeHjf1v;YC-O#WHC-IAMW1!6Vn9x7K{X0Z|xC~ z)3RyjiQ@TAxbgl7NkjC~fF#fu5AqNbVon1|u6HYR^=Z;M;yk~v3k@||=KyMST|&?? 
z9ZSt_H}7OVR1uzpQjd^+Ns?4&aA5D+xnS&dyxk)tP#w?Z~xcdlFWB!Sw8Uj^Xc)1P>MhP_|_c5 z%W(Si=o;1xEYf}r*8*Zhe~FG^>?Ke%ZPUzF zdW2^9A7}i4AnN(5|^m%H;(1hB%ca|$u1NoFpuyd?B;#_yaZ@2KY zt=yEskA44c>XV|s)l5iQK=%7nPyC5fxww2`-7|qLZ08XZzn;+4@Kn|tt6^?Dx#tg> zs2Q^TH9A5P2=q*$vNucE{=$t~5`A4C<98Go1O6T>3-}doG{jL7VJ0_bi0^*2T<6aH z0e3piS$_en4+i}$B8MCAR)5^McAK19wQWoQn#4z-a;K3lDM)FknaWN(tj!gjR()JNO1Fl+V5FV8VEOV`g5*>8p+?0ZM?{uEViP ztm#E_Vu#X=+-8u?G2N7e%&!mh&wi*H7h?+}_p*~%{g^dg536*wY%*5rGuHd1w>V&@ zXB^S!^{}4Ya7>u*u1}p0^OCpvL~T!Z6Zr4}2Dx!tGM|L#CJ0U4jwE=MjY;eQ3Zjvh z$T;`!{t3hRvk@;3{`Z%aCL=l->#9P5YKA;f-YZ4tK(wXRr99@blJLpHhg`dTsrPCc z^d!7an4bZa7Z)|vCd90p<;2ySypv*mJxPP;zlmIUZ@+XpS*Pl==%-pu0WwfZ6j>I| zR}LX*P0!0+cDU%_MQj8@Zt%`?1}5$6p)y=m4@@(Qpf+T$?rmEd1Zf= zWs8U=$m$Axm3hzIE2mR_X?_!9+q@yYc8mJ)~}jyM42Bqhx5S zW4I^C`L|78`F#B1T+{c**=q&0wRmsE8p8eW%+F10`QTD4fvaJkL?i6E&_X|$d?V`F zCR!kMo80haM_lXK0_u}x9Sv{Mh&-sf`^yGZH?gjA%O+u~Pc#`4DvB-EAJ;tNr3?Fx zTfB`F5b;hju5>vvVZ7_!viXG2hq>L=6#UL9!g^<0o@xTl*D{yr@$m<3$PD@Z_>Pds zaRNf}QswEzy~+g$1mhQ=6T*S#C!;<#h)d}aA%&_xQBn;V01)1RcAE|tc_nHVR%C!P zQ|z_q?|#UW5c~ZqUyE=5Q)l-G@`(W4Xu|U7lEiNJm_l^eo&;(UlJ*=w^}G2ry;bVy zvNoB)5Vi~Fr?yovE0X?w=Dd4nas7t)W`?kSTPjP=4_1h$QQzB+L(ww{kriY)M!Js~ zM*_T0m=sc=YEn|oq>@z(FR=%g5V?}F5s8meWCZwYr*I2fkLN*P>V*C=;OpxQAs z&L0HvKgfz!$cqBVpuZW#dPh#NNdakeQ5Y@J4Zg~5vz(tfanq*Tqs%uo*TJJYHA$~p z=b9~=xzZz!zW_;2ouk%Y{4FID`!=UqVWDkayXY*zs=9Sq!sZtj~Ih{^qvtM0bF&4}EQ6kvW--=RvR0zfKV-CO0~*`H=m3 zI_g1myIoFTd*qdZb!sWgcIVu-+%S>|iato~XaM=do6nf~RZ62WZ$&{k$8C5moSx^Gv@r=k#u-6f)FGKM3DeXu z9{Q;%IM`{rS6%lsg*ahgbo+Xyl6uGOw5Aj1N9hA_t5X&ZyV6+exSE9-WN5xKxuBomhe zjUuXJiM*L1Fk#D2E8nv-1RD4ZqKaAB8_8EQo=?;R%<#0BW%SAqJN7?ahg1x2`5W2w1x`HFYB!;(MzxqP3!`LFJgbA66} zIX~-9>_E?R{k;wU`^P$OK~&*<^Low|MN`X;{3}vR4YigZcRuu!^uI8K&8~MDhofwl z$0lc5DD0|-Y8H!ZX>w-`DjaUU$T+A2ryY!m{mIR#ZsEOnt&1irG^j1?%8<7nEO+$8 z_L~cvw zRd><5$KgW|>8+ekjO`sCE zTjDSbY*J6R_fTcAATbUV6~8Xo9qTHETzqC{4oCHGhq9)u(4{D2U}Toeg0J7|kd8jNPA`l<$>Vx?zgwUx0T?BHgH ztG#NJ#&aOh*R?Po24&r9dV*Q*^Ek^GHK;?8<~#`a9D!y6)erGJ!sw zdM-Y@qa`O@w|tb}INpm89E4L2TOIqfSldz?1*?DIHaV#sQ#<$L+)h1vXjecz2vEPL zX%%icT2W(B)`v^rQ(_4m}tJC@;KUlH}I7_^!%62fd2)0V-@}0ID_RXf%*7xI8n!{AhICPq? 
zP%WmfNN*9g2V`c2FClS3f$#d;lW~8&NI|s3zNVLR4HFjEhd<;=9_)t>562GD32YZX zhkrkbv({^*CuccBE+;~L!){YUNpi~pKC0WWtG^)QJRXFD`(W&?&_j%{-_k`$VBv?y zwIsIvMg;)4I6(A0P8PB5qikIkslL~Oa5$?3(lGHD2?;$D6PJNOQ*9lb^%X$1w>R4u zW>dGX@dw^922#S7)F@i@n^7~Pm!DD4-CRRiF~p34imS=(wuB1j&G(MY6^lwSelUG( zPstadP|1=nE{!(E(&bq1K6={p2QoTwgnS9>SQ5_-{$}Ob8<^RR*_&-p3qAdv5B%Vp z&^pKm-`YC^2GA;Sb!?%?bFK|lB3$Yd9Fc%{vP=b|CNy1W(=!Rz^r^&IHg4)nEuQDQ zbBMrw1u9)0jM^bxjIj%YSd zUpX7o5mE?#lRBj;DY>|-IA^($m*QsLqA?SnuE&q?SCf z_gQe_G!n!_;bi=a(2}ubzeRn^s)kU(=Dw;(hd_EL>b{rMmO9RLhAQWAkNk+=(3g~R zZB6mCyQ1~GX3MqlY+}0*e7`1>@2-^Y`5jk0W1#%?vW`%>T?OPw(4q*I@|UNU%q00i zaBn2w-S~-(8p@$72?rJEgMZ__YbzE20 z(MhPP$}GM9wWTHWS4aEpU*u9iXTwBbE!mG|bp8CcYZOInqwFUdC$UlJ2X-)zr(W@q za#z`R-L^`R>aY}=Ik>&%hovR&1_dRwQE9uAuqDCHf|3~qeV1Rd=`%0ah&X@b!Fc{*tDA_xQWLu$_mwLT z$xpF3xI63;@au%_MP4u-+t^$OkWN=F$(u?-CazIGWF&V@7KNPvATUQY)_KmJ+p+bG z@kpgETx|k;f&KY~&nCy7iGYvc-s4)`^zfMm+}VB5-8w%sM0K;m1sC!%rTp>lUQ&hh z#*p|dvPN|)QqM-la}2iLBr7^vC(2#uahhgWxu~eJIB>L|-6tRP_y#Zr7U(lxRS7a; z%0m)rYoFgMrXHdt+%G_xATydc#G*B6HeKWbQTy3zfKd^Zj}~z9nz&W2Vp}0+*^UNfbssg__CDI*Xep3|SIGok)VrZFCZ zYH+J|xBN}Y6YGpL_aW{miV^H|e$A(XTQ;EI-nm#rFgK7 zqI^E60XS=45SJ6{%7`aaZwz0i@)Pq)im>XvZ;kY}KB1?i*dyhFnBE9Qg<%roXkll> z4=B`$!F&*ZBD{@+gV*@^I;%XKY=fFNW_Ii}RVrVw(j(qy1)#krbquL9#GhVwc z2pSFbmq66vHol)Sn-)rb@7+5=_QdeNE`NWxb~d zKSam!=_V|wU6POJetpgUEKC6UGT%HXvnek@I>aE34JOI|w8!vZ;z=p^fyMH1L6r7@ zOp%fK5^&}~7AeKnibz6Gu>;{R?6Pk?rySNu2=i|`=}he4sTH*@*DTI=-u%j|;XYWN zRu6>L2a?|7r_?%M@uX0`kni|y#w`&!a1yTr)^z4R6nkS3AM31(%|sMk%C$Dx&26_8 z(jV zvmHS|S>>3#`?W*Hf#xVEoxU04za)>|q+Rzza8BA12?H9T`RDf*O23ODQuV(zjmQG) z5CFP3rgGQ25OpzKk%6)+u$a|ndNp3o2lH~|s5w!z{NYk~pgW&X1Q__?)!l|%;&=vb z9qLdX6pzDyr8xkUV`z^&to1-19E|24|9W^H(3g{TB#Qs}&_i#=O>D^pu1;9|<24T* z_}{KdM{Kq}OO&PipAPuPt5I-)bs5_cW&Te?IegVRQZO0uqnw?j|9H*AhaQ@ufKPz^ zXo|{20Q&U5KksS)m<-PHlh=>_A5N0}cexJ5#=pyTINbitu0#3xZ+0E7D*u0r;Vye! 
X-(8V)^+?Yn;7393(VfCuPoDi3bPXiV literal 0 HcmV?d00001 diff --git a/doc/image/ck_layer.png b/doc/image/ck_layer.png new file mode 100644 index 0000000000000000000000000000000000000000..117a1b3a0ef890a0a2db9bf23f14f03278d1d965 GIT binary patch literal 549343 zcmeFYdHn2Tc{fZEE!0+Rwbd3CZC{{Pg)EcJ4U(BjGFfIOlXW64S!XgyCYdCY$pn#} z+U03|THNaf_*8*f>QcppoyS&f#jT*FDhRF1;mD>1Jj(js0sFoWe$LbO)BfAf=bV}R zvfR1v>%Ok<^}Uw)!|9m&g`fMipF8TPqkdt;4`xRl_2icvb<`7HaqJVond4sbHyiNv zGv$n9j{4vipLyu0qh9(u)zGTaCXABcs1wotqemwq&@wBl6Vbtm2r^6puPU-NI0Wa@ z%#Xq-@E^T~KnU_e7@SNf1vjxUm>F~m2E)iHFx&$-|Nl$t?uJi6!8z3l zog(@_Ue~QUsGP;4E5_Vyb@q{W=tYW$-st%IVBfS4W;+P}WMd$`)?!J}vW zppHB+lkRK|BvU?m8oU+=ZwSQiN6w;fQgAu?iG;mfplTM6Md*0w*>Y9B(D=#w)(%+I zBQNvfS9w{QXKNs8AlGgz!p>a}D|_q+NR}PU!N)`$?`0lEgzbq4gQTdpPq#=M+B1eT zp?(=NFcG3qG91kN>v1$D?7mHzN{UY+&X@3&x1TM|@i-=X+)|q6GTrQ4VX;7#E4|g2 zsgtMc(X?U8#MI)+#)$WNw6OV_p~0);skmnqXsHXH*fTIe zS`;T%q7$OsOI8zWrPsELVhg|Zqn7i?RNjQs46+6UkU%hQ$?0tO=&<$JQY8n4tY~2- z=_(L)u3&M~#Y=m~#k*uF=X#WkmlmyNgSKBLdNy3L(|K)<#=RLBV@QIJ1+&=mVh#H_ z?e}R8O44DBD1tRNvQ!5f!yss|6^NBAQ?u*H7O1z3&sO)W0tv|<@-AAAwg4axmMOD}nb~ZmD~y9U^F>BxmIiINsz!KWjd|^^yVn*LODSCzTrSM% zfrN=8rti(0)izbKp=Iv54dqx1G~OHI;e;nf`iS>qcv~b_6&@|7_>p4#&Y3T406;>QfP0~Gh!ow2Dnh&$Q4@KfuRx4&C z?}wgl?z{1NESAM`<&L^zhD~7b?MC2}Ma%6J=M#K5!&l%>aK684e2W-+s8TA-RziJ2 zZxEp;s;$Q^?Zli;k#y|_zM>44J(Nbcq}Z;eeIABIh@6;tLCSQx?%s+Rv@3B+wA!Tp zoYE!$bD60+n`>)wmDR{>2q!Bo7#p)*qk~IM?95TQhnib*z1lIo4PPfi2iqWGKzA0T zmgH{bS36gAcB~zt1+nQc8^9vMFiy6gngI9Mt%EOd80KMMA+3sqskvILfkRTnkl~k| z5iG484vgQe24(|%7Q-iDZ!w>f%4|zn1JWMYeBo;{&Kvn;-9+%(oSL~<_01*lP7J(X zQK_(uri%tsR`a@MN5EV9kl(IWv5{=*?PjG43ES2&J}g=-3DqLZG*!^>6|8Y0-R)>n zi=^38+yIhHLXt<$gqU}-N90*+!mA20dj?^SsMug6Q<7(N*vgJ4kl}U%p0<3*8PlXT z0mpm9_l>Zw$l9?|i&!LbJx6DPMJ95&bc6<?TWavd@b|?5-wLn;!UqfDkr@ z0|(A1P)k&Fh7TolH)W`LO>%ml9UBA2bC`Y)Hqmqn-W8fmk8UMN=D;)kF|{K7F*sWJ zG`o^0t2chci7ddK1Y|bg3S7|g#IBN|85Pu^-?RlPigSsbHUwRc`VqN7XMCe;jmaso zKv!#S$?;3s=OPDeD!Wg-m|^uXBxLh;f~|)GV4uvoE-Pg{i-=;+Z&m|rwciC4Mi0F0 zga8s<<-W}rGjlP=>cu`vk+3EyYhurNPoh>yoR3!}sjskr_s}sGvTi9(v~ioMqmkhV zrQc~tQCUs6o~#hq#KL8QZDyfvb74L+s{lbPScY)8AqhQP1_UdOu_~P*;K0-8Ub`Ky zDmHCH;HWx}g!-iAD$WN8=fYmof*#pF#CN#5%Oq?Pjy_Xnt~B36C%6-$1T69 zAhD8(Bp({Ux>*>I9IFKfzBBG@w~;%)j&?8jS3>ZAVFz2QXSLpR>q(ZhSSO=A>Vt6* zz#ksN4iaY%$jMKbfzi5dRrZEc#GxnY0N8`xScKe~NYk1)&5oA{ibPU5PsA(_q9F?S z0vpDuu+2r0@Me3c$omU@v5IHXddNiYdArvikEwRs?V(OjwuzJT!FG@e>E43N$4 zEIi>?Y4)PNYVmcv-bKFD%K!-t!^q;B65vk1+3#~K8=>}o zDU$enFN0?zYGw+_G!yGUtAr(tc1X&u1coEVqfSf0&L)rsozcW11cys0?38{4crDx4 zdJ`XP4>O&xMNS9XDQj1Kx+iFBxiw(Z7;Sx}vF9lnxBV@%j_iJkf!h-)Z*o&v_Tng@ z*K1r6r&$!6vJFGah;6OyzURV0kF-Ytt|Kc!_h5UpQWAv+s?ln~AX@W78!22Y7mOL8 zFyk3;F-XWJsse`Jn}O(Y8Ybp=Ozh*m+h%DyVJM&G=AsgtD|6TwkKkHc z0FHqq5fRIp2PQGIj64|HrUcbrFgK* z2TTo=mR*k)t8_P4FnmYt+xZ0JM~N>$i$S*mXpkr!ua``Rfcj{vwRpIOgfX=jJc6?2 zY3gNO9_CD%aVejLTQ5g_QQD|BI)y28qW1IMC|c+OW_61m3PAj+c9;+zV(NvNl_@2Zo|@#eIdZMhXhikba*+|S6h>S;w`&zG=TU3SM75kSM2a&b zJ`1841T0J-uXa*07Zx}h_m?i-!+3wMI&lPqCAo$&6vaISnCq^M=kWqELe<^N{>BG; z$tJ_K)*nKY>{O};Tnpojp+SEKi3>Do6mO;5i)sXO7QQZNE$_sltocQaq*|}@V;Qe( zF-zs7Nz=H)tx<0lEwUyN2Q5t$a)hKA?p(AqbX$bC#fm^>Bt)CB+mKpnst)26)3qon zj*pEt`G_*LD4jD&v5FF97R(0bmh(e@A%qOfOvK%?CCjKoZ*e*(_K(C`_HvET@TR3E z!+pFckfk%S6$*uK*Hg)Ir3??3x2BNw`7_-21 zfiDOKo5~5x0>QVn(k#4HWkB(;)oiDuu7Y4z*=(YvoSx0B)xbza-t>r}O6TK}m=Wm+ z+fv;)++@GZz!8&xzfvr4$R)xc~cDqo!)uo2?#>)`S3++yKsjM__QXVOAj2W#?HM!xL@Q21q5FX^eP@fe;%M z;x&sY$43=CwXlt-j9UW1; z#kTBNhHdkGZWoN4D438r$GCLTsbILucxZ*UR@>@ov`8npx#=bsVga=pZNL%M3xK;} zyrLAz60LzWXN_!NWV*J^NY6p&#|%Si0eCjpO{SUz0JWK|QVy9!D2Ao#3q2^_7M+*nfb$RFe@k%#PRB`6C_dNgu@HOb`~vQlG9ymTK&G~gS$JPgR5RwfbOxJl07XbEf9dHf 
z)Hli6TIIba-^_(f8go=-08?i?bel#@h4jLmzr<*zW8+X2u?((A#g^J4D{r~);wNcG zl(?B`l;Uhuw=|N#0B2==H`=t-a!n;0Jwo!OhT9kvj=(5n3N;)xcPNtXO%^^W`r@$4 z_tM%lX|>od{7Ksa3S16@t>X8`t!Nl?o#OT+&<2+85MblS2IRq7=VMF2Q5U{MOO#+Q zXxHs8+(78%__W7w#&A8_^>DObDMN3oiTz>*S5s8YNEyu@Vg9?;u%{khlwOCWrsXZd zRn<)V44O0%zRC&0rskr$X#J2MH_dqK^;Y!|LKcpUo#Ve8jh(Mu?!%4T=o&XAjD#z zhPsK;(^j1c@thwm7)^b|VDgo-nomcvi_j&SJ9@Qc3{UQkdM!4y>Qu+% zY$h$6)oRu+$hFD>O2txqr`zl!8mS~6aKB6hQR!%)qJ6gR0}^cMg}<)107uxFM@tDB z+T~_9-W0JUu1XmzM}k47A*vZZgBuC8m(yt@$5f{m`;};fPK^!5VZisSo(dR|;wN&m zZT9_qWG=0FI8&GW3?7&1!-o0&0zhZAcGPJ^K* zbPYc9dRPs*E!#apr2k`FS{;lSZ|6adL`@~yGH`JY-~f3Hmrj#cCC=I}JqM)p9!<0v zueIJGW3MX(p(~!JJhVwNEZslWjHRpvsG+8xEsR!0Jn$IF7Cb~mp1tl!X ztW!7vO2QD4C(J?kBsvB{`OOQb)CMFR6?I|^ng zgoDt{8bor$PRe&hu$*FCC(&g(iPtOwyOx621wWF2W_7Gd)t~$1(IbQ ztqxlhDuuO=Rd^^(J3lq)HW}F)lZN3rXgz}Z;nIhcilkO@m%w5MF51Q4txbpg?40|mZrKSBRUyHQdfTP7HHlI}d+ScM*CatKroCv^E7(>9H}@7(MN`Ic9SOBM z11G9&00hR&Gc%P$X>3t*RavZ}1)|Oo*o$X$Zf)E3URo4WgoA_0-U_GaqSBhSu3K^l zUc$COlUR!&dw#gWJQIv&rA)BytS!N$%H-)hcK4HHk~=d3B)3_-xAL29l?6p%4!cRv zlRTe5s5P9)Lmcg>u!4edSPBbL@Fmo1@L6L-2?Y!?&SsE3B{dkF8_b+RvaGW8d~t%LbaW77;mF_6euh*O&oPu7&Dv_LnEM)Ab2DQ~ zG4p)faYeh|TVNr)qs&hA*d3*JQzJI*Fx;Xo3XZ34xdFq@Q8yf0+hJiEss&ERILY}l zaRcD19(RI|R5B$vUfDa#1=|hM>~&4vhPyFOMO=mfRA9L&y*7NSSZrd~nHVZ8kJfBN zFKRXGDgEsVH>93`nG0sUDG$`T~4p(P@Ss>Sv(b*@QJ28+(X!sT#X=KE^Dq-C0+%H=Xz6RAo^7KeLgJ!oAGcj0 z3&jFZqKEl(oxWZkQL*Fnk*EP#eNoNJ`@eYB!~gFa!yak;&7y- z;hNqGW-X@FH?O{&@o>{1YeVbN6IhMIvo=^wt++xa!1){gp~rc zRRxlUmVONR^E_=9)lg?xB&SzJAdUtG?lYyH7KDz&+ik67lTBXv4zGBbk>;51(2HWS z-mK(NkL%mZW@Zp4IPS3!T_2e8Q$d)HV}14ro@fr9lY-zUaBl(NSC&A<-p zDcXfm5vV4Bua?bf9gk-OTkcjsiSra+vrE@NFx;E6312q40CaqWgF;kdbM!NUzqmY1YTjIW(XUO zz7i;Jw*WLZlIS>Q$VoGsx%w!gVKj8t5u}dmojO_T6Ajeh>Qt$1n^6Xn8ikV<9u8R7 zcgJQ+f~+!X?4*Gs0W2r>>WT%$T;R*L{oL-YfapC5Hfed&&b{6)0^vE{Thd%R-KZ?9 zGyJruVNVU)xilNjMtxz>&Pq?CN18EDo!zvf2;0JdagHGv9Gfrt^F<+mQ4@C$(R7sI zmPwEB!N#o@?H0VZKZ`W1(-?3}@vF(OLwajovaC(=NMu%PbftR$1Ut8AXrN|cxW>3P zhx%Z$Yqlbv0aht57-*`;&B)Sevn=w-5+yp6XH%IB0V`iG0ls!SC5O$12Ofgqg4Qw$ zzUif_9#M->!B4;onBEq4r(w0q5K%+d91ot-(@JTW^)liWE8`GEU(On<5LcV5XWKXz z)FCn~_Q7)RMOL*7TwzliNsRWm1t?X;tqkch47ejB<%$*#5R6##;gpt1HFl^ay(?i5 zCk8B|u6bAp$$0~2u^*7+5O(VoK4YQIHPu?L&xs+F_U8?$x67~?mu1i=;U!0Ia~Phj zHWIEP0>*%ZxLXc~W03rFF~!q~o~MN*=Z;`1g6ep>ZF=Ogj61bk5(8UiNvd9~Yka~) z0sv|@1pXYj({@IY>+|$ztQXwwo=JUOMQnry7w>ZFpo3;x$SeOEcaEF|w9WdpK<+ zqH|Xhv2&o$Ph)i;EKdxiPY7rj8c%Ly0lAgCMFV2 zj2cCsQ=3^7&5A(ar%jK_2Ud!YMoX9M(diO061C~$Ibsid5SjTtugJhZhETjG^GIF< zImn!tAQf)Oqby@XePN>LMr!3eVgMtgUDcq6Plc{VpkmiTOC`4>~uNQG5 z3tI>t2vs$Nq+z&2RXP}%zEN#LyrSnOvyS1FJG5eb#K4<^qq(#n^{g@~on1NBhV@YL zO3dV(*>tc&U91O1alA*1;W##0d$r4JY{!eX4HZF0zfe$`AqIce54O~VFEy{^Xlq2zhg93;Will)A4&dl)3_+WWCT7!Xm!8>`a7Mn9@Flo zEM0cOYAJtfk5^5OBx*kn*BCnj3C*Y~#<;NLw?u7spu;%*6;0S{591YGLcB8EDYKPq z=d}y!4=5IioDzi{MwJkML(xX0%k@o zhhxM>jMyL^vM^&)TYt(=DBPYa?V!=8;+*f#*6OkkTYy9{Oc|m5xj`5kwcPYUevhS6fGRAA^lQ05gm|Mmd=@1D{f3qY z)u%B4mf`a=T9*ckG~W&&e3{R|xWoM>wj2q- z!^U!D63}$1vvA2@Y zbMTR~Cx)I?n-K(oqInJq7DggbIgdd|o|?Slr%TjiHzb_TK|=!}_+1KhRE6uZW7Fs( zwSw8mcc*1L$fMHLcuZy(lip~HH642k9LsW#fJaWxB{uHD)uP-JQ4_L)jStL)6iZ>I z`SD=i#ZcB(i2Bt6rshmZfoPH;xE=V^nS!Fp?tR`Ae6kiuoUd`8*!X?Zu+LJIaAJ!oz zCMEWyi0Y@kULO(_1yeRXH77ckKd~&a=O-5EA>pfRQ8n{^)o5;u%m+O~0Kr5=;^f?ml&8X z_+p|37;H4mMlaq9eN^;N(lm#VdToD^-}l6%z8c!u}D2F0pVt17yw<6 zHrxJQz?xKXBY`v+s5g#`xuljrNc0e#h4HiwrEP50)6Jg5Yh%tun-#$)jRnJ+LB%3J z%Yzjx;U+)u)T$|oiZYDafRzD?;rzy_Mlw8sH~7w3p=dbSwL_v1da)-$akV5Hf4}!d zjIZY!C%|so;~co9;X+*QI4@HIDZ_?P2vu@N@tiJ}qN>jBUcAsxc zaTRHf||{`0EM|Nj@Mo` z=1tIKXcrU#`wEvA1D&wyzACXhr!G{`gfIy~|3js7ap!@Ul~n`4EU3`{M|ojyp@>!B 
zJ<=ib(AW*4$i-`ca^dmZ9CmwROtwp|sR7v8PB`kO-!HxoiN3Ie@% z*8K@avjE}gp1agF9n>?GMT{nDWJEh%$2o3e(IAsc*EO;xz_V02C`XJf_NjDSZnn!N z?f4|UhQ=*XL>{vpg+@e^D)9I$CtJcq+DLW{dX0F)Fy+v2_PteG?Tvn|)uUk=kd*}| zP@;oHvffq>BXdDWRjzvyL9&*%l1;Shc{MdbU*#4LRc^dAc@RcceNd*3dmu5)@g+H2aJx=#qnMCH z!L_U|V%^#HuJ!GSomDDEMm{}3dsOeEl&Q>};**Mu`f)SvB zf~E6d%o~2zy>j9vAdSW(MlkN@81I5kl!PLJq8aQ7O!P;%Z3OCmW{Jd#%%>jck!rhr zlBc1xqZC_1$dy>sohzOUyCA0kb^Fa~GhXhMiQ{HTBFp1}G3#6y$loXpJQ_rPkp{78 z^(SgD$-E^UhbgeFog=kC-E$CHurpCua>&f-Y4Fs@ds9^O8qLgIR$S12-Z@Ob2D_A_ zYKyw9l6Ky7<^}#W+(cPZH6R%j8L1rRAa&H6??b3%l{um#8<2-sMtvn^U0CZp+uC_$`-fZIy1D~J{SGDT_^1wD_;tY{=UmF;^4p7Qhh-&_Dc^$1Z>i6&uk)#0qZc7 z3akm)u9GNAb3WXHZo3fm(k2cAgP#uvv_~%xVo>m1f|{d2)uG%%_KY+jc3FWkJ~J&S zi3~w)KZOA%F=aIcT#MoNAX~x)16m)D;w~-bghD3eQtuCh%_u9RwGq{FrNVJC7?lZG zsOpL~dceK%%n&pfnG4WwxD$AwW6VH!qzQkxPvZ?I#{&rw`&~kxsU?RNxgm^kg(x5# zPzJ-wCD$?5eTB}lSvr#XU7|wo3=8~g zWKBZK-PT6N^BFgo*kJ?Gn3m1jFbJe4;E`o9q!zpW7R2LY3N6Nvsn0+YyDMP&UZW?o zENWV92C$V4_Jhfk(i{Zz6F8LF%UZvTdRPouK;!k>={Jp7Lh|CWe{=< zXKGGO7fvyV(IJ!W7D154QlhXC*((4W;$YtNp*0}%p0ko+1)2FtzF2iz543l?uqQ?g zZLNgwP4%&jPIge@z=WxdyGonkpa#oD^w3cKO%KM|JOsV94D18>vQyiZ5!9^VXpCws zF{(=0ZCXIqS_{L{q-Z+R+;-e~rbe zzfeG51U|RJMZ9Q>Icq9r2|D4Y6Km7!l* zbW?M}IHX-I2(ytn1h*;9%w~`g%T&7PVBE9<#EIS35IM&&+Z?aBKB(hqY=`Gfx7Z!{ z4~JyCaG0!QL6fqvGgBAj2SQW{NE6Zlm7ML(#cbALvhInJ9`WPcSi4b|keU)|y^B+W z8b%rykMQZ%6^Chr&l7KLQ#*7u!J+}>2us;i!1L+~neW*9y@7AD02m!sGJGHyvAIc91(;;e>qoQrsBwpgs1 z)|R1}h1!E2wi+#GdvTerc+F7Tz*f;2Xa_4tBX?dQi+Da_m)x>d7<~zm)MkTM3W{dw z#DGCy(Ix-3!FE0tom7)$gWmfw2WqE7epsyp5R!=y&dX+4E=d|4RBO*h|OO4)cux!qt^o|>u#cZ?~ zR{DBH$}TIBGoLC8R@<5w?jRPutO$SCsw%3@hd2Qo2-|sCe7PqOu7?pinoPN3xy{F8 zf*DW=kLW|J4@x(1<~T*^3J^3a6_^owOdm4>%R#f@EMbipY>YN9wZMQnxUFWqJ^(Bx zaHA=LwPi1-)eY$HFUNa*V>q)BmKg;ODlZLIZNUAp+bg+!M7srO7^GCyLfa~wj3l3) zZ8v>c1iy7*)p_KnsMx}T>3+EZc}m8y)7?(nn`}yQjxX=WRMx1-Tt!Ga5_?=r_Vycs zcl)T3A&%&D89;p}DVoYP=_V7lfF*|@+5lO`Ocw^W*6i|ZkBpas)~0J@tk-Fx=b&=w z@3urpdWBsZvnA{eB-)(-Rnf?par?AllS;6B(>*@y#`Y)3{q619bh2yaK1+! 
zu*O1ZLV2fYtOn!JxC_J?BxmLzTcG%^xsuvE--nCcehlKiVdrtAwd5>R5v01pd`Pcm zZqWip-bGUw?2+Rghmb7zc}G>MQ)xHqjDCU7(^_1ub2(fOtHsVO5YT0BPa(@0=SWsgJ+4L}*PX%;_P!@%KXoljlU0bB%bht@ujje2P=#dO4peuSzdq#6rd0YloB(K*7& zv>tadWT$nPT4%!rv|a8%K8_`&0gi7&O>x#_f`MR`GD|QIj4qL1)5M3qYXW5 zAvA27v*5Q@PEX-QRjv5p-0s0$$&mHIFV6YyI!lnaywS?R9f&*^S2SG-jO})lfwVikZd0`67Ow%mMaC%Q+Zj?yMGM3d&rRmDU5CSJ&|) z)o{h`X@!VV3h1B$yv6VBR}IsB%gKYb}newHa2mO(>E~j#_-H@~s@?l0mEIRkiW+%DBlv?_d6>dWWbg8F( z0u%y#CCJ+dBri1%@&HQk6-b~=v~`2zB!_mJ00cm=_Drv|oZjM~ZFPmn!2gVwtKr_`!H*-lpy)?zztD==fa=ymVv@kjVM4>l7{og!npZTGe-gM?&XI_27ZQuDSbtiVlZD+kAzQexZ-b+7t++9~c{LuY>$Nu(F zKXddk$3Epbr~hXU4xDnV^pxh#-?>curT_dOkIvx@#~u5U_}Kr_Y`TFe`Se>Zjqf={ z`p+)uUnTO}*L)QJPs`ATg!8!n@*w{$oPR^t|64f!Eu4Qt z0{_OP|JI!U7S6vRfq&!De{0TvNI0jT`ostR>B(pFlb8M2Z|}VH;C0Xc&=qfb=%K?` zA1*d;{o^B$m;?CFeV+W*|BU|mZ~id)=Ki33{zLa%biMof7yiR%KKf$)@?$QP-}Sa< zpZ&lO?z#P=U%K#5U-eq*%6tFe<%b?Td~zM>2;=>$KmW-k=%k~cdXmJS@Zhc7 z_3tjf_>Py|`~JxV!>7NZ_XXnS*LcMpue|JkTvML<)!+ZfQ=ao>?7(Rc{otJFdtZ6~ zq3^yc8()3@O?O>)@rSPZ`o})YTG)@PSYM^B+ZDz;2Lk`_R>| zwcS5FvAy`WA2)mF)eqhKp+oPCKR6<8s?__wkiGWJU}dNM!rz{Ix%}@kxmSPrnD3r{ z_CuF^?=LRC@MAZ;^2O;V|LOO?_U#A1b=FgxXWxb0{?M^c`Sa&N!ROv`&)w%g{F39J zv-pi8(Vx?Q^?+{7Pdw|km)|Vh{J?jKYj0nl|0|DQUw+5Od;5R+cJ=qer+=(D=?|Xz z@3X=C)K_2pfsYF>zUzw8KI`7E-hcNKgtyqYTzbr#UUBSGo^kfkFm&gSzID#z$uGU< z%RjoD+6|6C=dOACu`jsd`=2^=%j=#zzWmP1ZhQL~*_TiGCHmmE{^Z4{+|+#Qk6%~c zcITh|` z5ym<2#pC|+lIw&++-dhIfArS#(+@oHiihsI?a(!kd$P1R|E&79lglUkyL_Oo#R{fH)8xbKW(FE;|3bAAUXlP<#20UwkBGIp&?e z^dn#xZ-3H@Z~yRxf3p4ZX^;Ql)eqfw$^Ca;e$nE*{S%Hm;xmsW{HOoIfgc|{|Fj2z z8p%M{9{ObdOU<*N{?%Jg0_t$`hYy~2=7s1N$@hM9eZm6|{>BME=I!tu!F9j>`WL+E zWk2}C$@|WE?LF^}KJ~lDU2x7Pe&a~*dhmJQf9U?ZE_(QZFFh=sdex8I`_6gzN9WuF zHta%c^U2#k4y^8?*Pn9PpWlAx4=;WS`QvZD`Fe2o&2RX-gD*RL=_R-R+3PRdA3T@% z!i#SE%R~JW?wj^M`;B)TVHclL{qgiz2kj^5pT*6zL=&VKNQUS~3Y^VG>#t~~Tr^Hnc>MfnBnkaxoU z)=g*K@%_nxzkc#*FL_q^d3Z~?ANky|Z_R+sMSqO_1CM^`CFM=t?ME`PpIYOsCm;C4 zH{0jm1g3QesDk|TSG{KQM-N~9%^%)$*K3Jq?QY!MTiLz{PvRS*BmsSm&D zhu_Gqe&^u_&%Nt4CqL_~d;UIp+dTYL@&w{P5Ca~2;MkX)eb<>cUwhK`KJ>96bRRI@ z+r{32gIE0}{(C0@3q1Xbw?Fx1Kf2{pHyM`mdry1ZFL~R=@js<-kFNQhzu0{D9gFKe zarJ|DBqtvHvlm{aK6ER0+UmRaeCwgNAHV+Ob3XtO>h}&)pM7TOzu9K<8;`ispBniM z*L~&v>h)iI@m*(>A3OMuiwEz$;jR~-0_Jux^7w9U*WC5m!<*wF((&09o15rbiU5s-aq-tu=RpD(>U`Kw`j%YmOQ4xfAW zLqER0|Kyjx1t6Mr_>3zrdENi#4u9pqFPmq;H*Ift#ia+o{+heL@KL(JkBE|&sINYJ z({2J_EWGo1Pk&|ePxk`{?T9GafSb?^W#t4REvvqx5k|>ea#O)dcOQu_uUOf zyWxhzfByX=uKh97J}^G+f$yAp_UN|1{OH)Zb$`*h^kb z%+W7BIse@AE_&Zthi@W2amJfZl7&y){Ms|W^w#--$D-$>!g8HQyiceom zzWd)7EOgc50cLgn_R1gq^Xo4q+G}6mbHf0^E!t^ow)UHFapU*A~1 z=J4e=%b$JL?hNtZU58&6f8fq50KQ+OPG0)4n=|FdM_k@xVaWTfr@Z5`zx{{rqz_*5 zxsP6R_uKK~pZ*)CzvsF-bxA*)#$GyVs5E)Rn^5j$RxbcYU^P|&W{jT)mBY?d> z_{}$dZE^lfuLKUG^L&rH_+tt28ARpj@+|ACKx{kL|{6wwn&%#(|qOrMETw1 zUwu`q9%v36{FA3%4(8!K_rqU$+5PVWo3j1N1qN^@)xmRje=_{VGhTPZNuF?Q@{zwg z>Cj(X{J~#4@gqO}?z=7mu#`OKf>QvSyX2<&8-IM+;m*zv0q^y>Pks5w@bDN(9(CX| zk9XgC@Y3(T^ZB1X+sFWES$`cE-tng$dd3OQ{r*cnbIU2*Z=b$6=2yV(btrPqi+&8W z?`=Q4vHa2(&-lvxh$=LFPX6g%HsALJ{2R$vUiBMax&MZ%AKE_WrurY9Kd2u1@xgO0 zJN%==V2R&6{hj|I30{6o{k7mvJBF-W^o`ZUAGi)+z>Ocgwz~Z#M}O&A@7=m!n}u60 zJ$RmY{sZ4WxjgmoMW;RZyf2*g@B^Oyv>TrLSZ#h(Jx@RB`*&V`*M;v`oJU;=^zXd` z5EY$w3DC(Wzxykndg7_?KCk`W`<26Iedn#_g?|jJ_?DO4c*J1ivAX`AbB|Up8q2SG z_`VysbCG}9zVV~C-7DX6=?yXeV?YL9^l>=;nv+xcLd6`_vcy13B#%o%rF;BiFv~XP@1?`YVUBOTY8BOYZyL#k=o)B&9xd zP5JU?ocp^+9a#U|?ZE5~U2^^Sc>Q;~knvLb*7e)I`mrOa-7}tf;EAVR0j0pXy?5Qb zS;j{uFs55hf8Se9x$M?IyX20Kf8!(1_;Y*r(Z}xFkzyn3iLd$A@i+78h0?EXzgB+f zrNkYd{MMg%uXy+GfAHk*-}sb2J@>e$+&Vna;ms?+mcI9o>i_-tr+@oN@44;MXHHK# zlIT8j{xu!8U-Fq>i~r|ke>HmZNf!Xhb=Ctn<$wI*7k%d~>UCG$f9H!==Un;2uRs66 
zZ=ZbHum0u}3GoQC`YABD^)1%`zJLDsWj_XHaoRuq?o+Qj@U#Cfdv6|1b^pDM7bPW? z2xTmVI-wFWlh9yHj^P-J44FXtdPP{$#2GMAyu(;>4+=J_0SPKV!qyNCLG zfA_PV?|Rnr&$HI|pL=y%IPYP<_P+LYU3=sH^17jLtdyq3{)Y2@nr-iozlBgrgJR`+ zjVTKbIFXd?TNbPAWdCnhNvA&y7jmURq~fQb-&h1j81#=a+G9>LIdy4Jm+d-bc|0eQ zCZo(l{`*(>T$MXIL~1HSJAK1+W7_xCyzd4j?!6tR8MV;Qsv-bwS}YBdO}v!LRK}4D z^qvnG#V0J=jAH-Gts>+e4@+F8f%+!lx*gsk%P!D1IR7o8iOKNMxAF8ct7Z(k{~DJ6aevoKK|%^GI^-h1 zjh+~OuhPj>H}b>3oMCo$xYDn0Slzj!Eaz4J?HK+?V)7G)3v^=|dIjIfyO7EFZ#UE$ z1o!i?;K)LMkkGa0|G27u?)kTW_$~7DF31epbW(vJ(t{uM)-~SsF9+uOb$H+bIKyF& z>oH{<$NsGl{KxBiFUY&luY5O>U*X>WO1@GMfxFq;9SD(9sAX~FfBP$IW8odRz5%kP)Lj)mVeidPxfyOqZK7(56)!mGExf5TsIl}<%r*yG)s z?eRO(dP=nae|Pd$?GV7&;ji2KkyHJ_WsG`u6 zr|)FucP_l$tnpj@_$NpCIi7$Vse$UUXX0(;fkZ{s*1UWPpoYpe;D&}?0d-)}c^HA< zQV6@MCfd~bY4T9-YI2b9-wtm}LFXpw#d2JHG?N>zN7}w8C;Bj=;Y^sL7B|VYd)0IpjSaAuvD}FEvou8r);8i1K+@flvM?3 zq~0Ca9^coN{PGvbP?%>Z1++dHB0YBz>FwSF+20QARom^PinSUMOR5V(J(1Vy zFaPXrTQi}`zI|Egl!yQ1_^W1-m3Z$rGu>Wq3-x({|K1-W#J_dGBGMS)V4owJQ^Dwq zX2ry;N?@>~)jt$N>tTfp?UNFK;`biE{qelXZ}JrzgIlxivT2*-g@8eMN2&=53jH|ZrN5VwIFlh5K4i^96cl{vDJutV2=-Pidk9Mcw!EbFE z0qF@oQeN@R(BfXh>Q1@3OYT5o`Y9GcX5<6z+(Y&Q1ON1gy9*W6P97XG;9KZ_B6Pnc zhh`c$(U+Dl9cwj=K1l9NO}f`}BY-OcI11m?<2q@n6TUev8gSiW80ta%50}viH2K4O z?Lp6OJF5+>U&fS042R{;BS*^atbb?bQ21e_;!t(`Z^q4}ZqNm$+EK+Qgfs17cR< z6$Av_pZ9AhY=6>8Tcfv)<~7D|{4kmQA@fF~%yOd*pDHh!y?7)LmFwr&w#bS3IYiSk zYcJfi=cu*CGgWE2{nc-{p0d#cgG5q$A1;Pt5k2UXk z0eJiKdf4YTsXc^@&7E&=ky%!&!@7+IOK2cWn&EZ7?NUBMZ0slf>hBr!{{YEE5+J=b z{HCVE(=FD`c(xFmS0b5g^`^#;T!}3Ol=8pgUYb*;d)8rR^Ka1qFR=JK(rrBquVPQt zO4TjQMYjx3TO+aF)|)Yk1Ob9iZ-h3 zTzM5y4`d9vE}pXQ&CdrqAjHs5Ww@9+`|%(^=u^lgeiVb7wZ;!o|AFUE-Tb1iFTK#w z_{_iWF5INe;M~c)#y=3+o%l+``?yd}^?pRkZCddKS~&0XaU}Yjfe*cVO4|Y-=DiqX zss=&jX$VA1opcU`gU=66g(OC{^i00Uf{geYO-ryeJeU5=$PjCIE|GKQ87J{8EiNPR ze?1r6p7?2v{B7e$|9<|w3JSvp97tMX+YTGz1^JB%Z;KPKrw{#t3|bTl4?g4 zm0cj7*LxlQd0bk~rIk*Uz5r2pD4^9sCbTsg_GmS%p!|dU*;{CsT;(QH!*avpZ0${{ zQ+gesi~v0y^THC!NerbN2)^U!p5QItB*$OsIMqzhryv5cIx6ekh`XgL*8J!*w)b$p zIDRHqE~{|Zzi=J$UOoG9x9HPLir?eC=JJ7O4*gtUSoV1RDSCT}*IktNjR7H#bpmEK zVY*sYtWUG2UOyboCDR>ey|WoA&EKWyzZpV;HXmm>*_Yw`red&VVq89JUnd8>=9_el z;YjFg6Js;FUA0w9#4U)XW$a~dj^3XM;|8yyv`w=l(ntWo1mW`rx<~69zkUAone8JS zq$dLLQF|o6J^K&@48Ri7=U+m2Q*vv7T<7h>roeQ}9I|@psPc8HEvJ1M7_&Z!!!?hX zI8Rx^FEv+@ojR}VH*V}UAQpFb>GQ3$Xv-;+ub#yE`_E-zV_gv8vZ?;(D^DjKkg!R9 z+q9xncn1AS7xJh8{$oN^?ST6k~|R21PlblhVLUD?7%>l^dgBY^uH@xe_Gq*Ir-RO z#oy&a5+M;QP~SNtSt5z9aT>70ONRXOqPO@N!o%uC^7U`+ra}P3tMs+DV!*j&WH z2=bcOHQDZ2M`zYwL9tB*Ay$}>ob_9?&9xtj+CDp;m+0Z*y(ca)+3s@YTaZB#HBjBD zfC=rd^_UI!Bn5E4=>qX4nbFzlS%BX*@3sppe?Sa)!_$#5vIoYZ=ivdQdH*v8D|gTU zRMqUYP6=*D2+Yf4z&2jl!FgK&qvx7c{%N*oC?3yEg*$xuem$#XE(R|w3uyH9X&Ocb zdKn3|u0q;v@f_)$32pBTM7Y|T%HL|7gJPmea2nuNwfc<#zYc6v9;+fC+}~K4U$ts` zJSFiDmYg&N37QsAl;ztG2XOwu`2Ee`?xjPlIUk(nY-4&A2_F{5ns)P7fK;2cf3@+vl(j0kA!UT&>yF!IUXC@|aJ>81%{S zm-I}%T?}vdACNMn6W~eMj5fXhtI^gPT!c9;`2=FMp{X<$Jq1(!}#PA}?D(Jj=i=eBgZo0k{iO3xTS`@o?M>x(U0$xnsdKjtM zIzR10B)?k^3qNuL=aXKZ8@iMY)SU8}G7{+W&Y8kX@zGOW?iCYjYm?jMM|rf~*Qe>cGgq{|1e6#$6p!V) za?aO)t0U=kviO;2!yrPbA*$y}7BUVXzee3z(c(SlG>J0(ad~BMqtQxEbY!64x+-vH zI=b-8rxd1ucL~W>X)|Ses39tk?SbmPC<2n*W2J6rfb4Sp0CMEb9A{V!a>7CpGZ@fy z2;la#CJ${;d^sw6a^Er|Sv3zyh~-ef?QMVSX+e(tTOU?6K2mG@(tD+6a+GJoT|i`` zs(6%g`^DHj+CA(pWos&TKOc)s^=CEAczaTLhhEQZ*c7va7Br~V9q=x&{Bzb+J)Rq7~wOkQxI;&II2Vy~F z_&B_?rg~QDIWQ(!8-g)r6SWoZp2gJi&xw+GMtoIOeJ>FAQh&7gu2*_ba5@GjVS7#2 zKCm5{6F6Z*Yf;ZmIomhC-`l&*>Neoho8I(`>eA&MdV}2~K_c-#lw}Lsv8SAy*L;3b z=Y4&v@w!E%iLer4`<0HdNYi){?t-raQ8yc;j$824YFAF?TtM*4wTSeuR8QFj&Urrw zI_YWE?7?I%l^-ZdKgZU`>=}+o z(z=PKnzz;w;bzoJ;D*_l-_c`7Q4B 
zGIhMK*zKI!FA4l8bW8%!EsqOKjajt;q(?&z(?zdsRGbmiFL>}W$&x!mz*#{Iw+ia*~7$ZtLqEyw=4#e09=0N zmkd62TNLvUhuB>$4-Wtzkn{aG5C@!8l||~Bfc%e3uUF1qTchvrq4W)DAV$&-=%PDz zUlG4yiLZ3p_Ji+QY>^qcb$iN?0QoUrWzUoG(r((8ulrDas*mN*7Z5ON)m~i&RQFYPwl?`KURV2B5l<-cfcsFjdK>sj7L4oPdTyPH;1en4k=oGMd8IUN50G0+H6 zKLyKmEDHC7Q?ng-rKmCn%p^kt67eazNJDrt^D;@7q!Z!s@yl<9V80wzkU4A}yTKc) z;15vbJ2Z=aivp)iXj+(Rr9r~XB|PHTiDe~NKa4e_o1|X~H!gc;GF;r%*hU!3C^)f* z;0mz3y>wv}cgk*au_+XzCom)mpypM zcv0s3I07-})JVZmwOgXx)6BQuS1AM#i`?@q*~1_DeP}qa5v7ruX(eL>4gb&+#H8K zb^aNen$4fm_GV zsiuVa)l%zL2PUQ;`(#=2Z^NJc33VgNd(;dDQ9=pTEf&CgdMpF5 z!}d{JK>fZOt!q9~jw99D>PMTMpcJR`{2boV^M)|{B^i5!9Aic^nQWg))6(X4$Q!rV zR1v*1r%yW+`5hy{r(81sz+k7pRnt-TxreMz3tAD2$Lzy#r@Ql>R@B)HwYlAHJYDy+ zSC(94Mj-0XdRg0A`KsWTNK0M<_di(BkPGIp)-vZyIbwT8;tnwL)Oi&|TS*NDv~@27 zk0nEidy;q;p3~GK`S`MJZjVpB&u*E3dp`A+@z!;JwSCBe-wg-8D#Oa{(mjPxz_P2> zHG-`0R1NpB@AVsJCo-Yr1|jJSUHrylD2sE#Gj#5Pv}@86GBaUR_U(@=?dKVU&Q9H7 zLd*VK_?Q>{0@Gu5>>K+Q9$a7qqYRt>i$h&EElyl}s>tJk5q;naPCS2>t^J5; zw)(iaKY{)>oBe}M*AI!|RZ1R@az41Joq5}S86|dY1Hz}?yoAX;#E*6H#*y{cgCpKd zwAyk@&qF&)YtE_eJ{d@0jhResk@m=x+eaHD4rsGoG>vV0Jl|c;`3`rLhHXK7!yL3b z%9p~k&l(qy0Mf5+tPdgX=JDoFM(xnk z7e!khiaIi3D+x718Prjru9sU|U?BRa^ZGC0PB=Zr+#%_Fb z?SWo-%K>y+DYS2 z_qmp7Qo>6@l!t#HhNq`ygeh11rcQJ1VafQgWP(4gfn%sBWkWDNRKpGlHA;v$hJ_e< z`;OXE1T{n+34BtLB8Q`tn0;7=rr>%frLY6s&A#&&F0MhXyj5I*Iw9cGV5U|H$j10h z)+5)zXPz(k;$NSkzEBnjUDY9)Cji!{skP5JY2(1M~W zqeYmt1mlPp#GDC^X$<-@g=rcpxBGg7>H5_-VdrS-n3{ir_wi*C6`av}tS?b{=kN4H zCkIK*@||O+y`~{On}78H>)}rml{k}Ex#)3+u_t_|Rwp%mr5I#Y1OV_3C3p=)c~B8L zEn~}#wp}eiN*H0lE1S`?mnz%k{|gUe=~bq6mgSKH)4I+)|| zxDl|*TM)MpSS1CYq*HiR0gun6UNA=3crocP^9|FVBW#Er6Qx77A4uo$8;8zK%&8GG zdHDXD(^uT_LUd=UAJSQTs0IS1amRyh8`%b>?x3oJr?>E)FKT+^X0EavRmnS-(s-HH zan`*~=-u%B9<^yDOnw~rIZoER*&g24UA|JlQZ+m!)Y}qSPJCtbJ{QGvt#II+cJvAd z~n~fgHj#xeqLR?=P1QE2ycDh`|XhNJ78fH9E&K@$^RO4Hu26h zaZ6qr!?Xet*`E$T&9Y>2UwQT`Z%k{ z$Amsmh+@1)DS%nR2ci6A?nRaBcE?!fMz=cM#U-4vxZDe-qUn_b;$(bz+cQ(RrSc%8 zmJ5r0w$wJhaQoE#Rwnwaq8bS@^ai8{10PD&_JMs%qUFOw57d>q)qZ`CAK;^;4wr7M zU^gO?dESzx?TCu#Y&s6_(8Yl@#6j<{7&|!)1;2FAbK2s@=QT@H%LXsk7NE3<0HrOz z4xGWTSBs5$xd{~d0EQ{+p5h3+^wxJNEa9njwui~-MVF}WBsXR68Yk_6gs$fGr&DKw zi&izF#SOn1IotKPp~t?2g4I=4xoph7ZG+vM-vQ=;n0y^>`dd+76xhCpq?@x_rf9ji zOVi-{=OXYGz|%yIgI^U$1Kf?CZrwmjJqAuD^x8u+q`3*Ep>n$W03n`72uVqi!v-DT zHRn~>ec=5L!PW&E6rY_>2?fp52&vHzPOGnI)B^5-3{q~fd;yb?zSjQ)^!P*`ks*zQa{LN zQA8hACp{Ti@pCLgX~cxDpKHGCRfORQ1{OUm2xBH>m^^yPm=UOWUim?unerHz`2^e) z&|{K72P;=7=1jR|R-Xd0`tI8+GCHTDL&z)B0&K>1WqZ(zX!fltU~8O5TC@ zD_8d|F5#n!uk?wA17U$PKo1_LOVo9U^yvrBN@D^S7y9;+)&M5x^%aPk` z)Y&nXADT1JA-i~MJ<>GdV(?m=TPLRQ z&*w*!>T0(IFM1aXUgR43vRpo2ySON{h^Ut7FYjFLs*5VBseY|(6~1*Onf#ej%fW0` zyE+@CRoqwzBTJUdB6_Q+&Z)4+jW{9c_>}1-NIru?ty=Vmy*VMlo z=GRE9(OXCMES)cEYhl?+q%Vyeup#kOOD4~sC1Z}9;tDH74-RP!Dfa7ZvkK}}_ew8qR6`o?ii9L7cBlX(&s#qAW zkiEwLWCUKntQ4>3nIAWH?K!4AL_iGfWhnviNfX&mtTL=$Dwm+Yh1{twkt6Sh;h0tJ z7Uxo&*^DB7^^S${3r3mgbfeO_>5M-QmEO<=TuXsZ{rjd5d;U69@J8saf&j~l8&GHF zZAAf3PT|c2#5wE-}h);H64Faf zimh5hVX3B5yp-wOT*A79Q9g%<7*`? 
zTWQy55+w|-f4{Apt}bi8@vXOr4q<=Oz1NL)vUuwS$37h%0=2&I@i7b_3#?u+*!PoCVyF@(kyoohiR z-~8__(^Q62>b^xlw`>r2AZ?i@ua3v}uDHQGpjW-Ti^@apsJ(%_w>uT@eth~Q)I10~ z=x9g8)>a2It!MU|S$w5G{@p)qbV$|r1}U&!d3*4aXb{;=t*HAwl9qFa@}`5Kk&zfCGv#^veF>B&yo_t!yvi&PIeL}F>t z;Y*tLTt!n?eHdiAr$#fy9PRSE-Far97^je_DI2_;Q|i}cZg$(8x-%g|T^oFIiD^U< z8EUjG-^*vjJ>TT6u~g_TzC_-STlEIeh0WdEz?4%iCfMePK$GGwAHDKmb zV4$&(YQvoK)(MC(*m_n9XQri;>1kA^yu;i1UH7zmmYlFuxVe_HbH6SYk!`ycO^YvE zk>p5p4i^%I>Ei0tI+guuH^|u=_4URFhj9R8Kke=W>6~_pS%Tq%{brT5;9H2BD_w48 zDl)CLnz&a=h{6eR7>G%I7|t1)Lm0=ZTQ>e|J6_LjyVw?3-Im0Yv_O9%lJRNn4;z1< z6k5p-TGrVHo3Jn+pDJA~a7k_(YEpC=Lo@j4X`V$0Omw@ti>UE`4Uu-1vFR5?hT_w-@LV(>;-`bEk zp%3Ay+qagvvYq9hll}HgDHnT(fWga^c~UJMvs5U=5F>4q8Z6d|yCd#m&VCGvYiENW%lw&gqLs$`*%&4Pc>#yh7f}cdd*l&k+-PztE zeUJOea_K7kDR!#EKY-b5Ak!`1|DN;a&cmh#HKMdLwhcWRMK7m^%{k(scJ9=z_SBfB zWD{s9lIS(-iZ)7xdMzK$OmCm%xD`vFu=-12B(3=BA{e95@p))1cVLU^r|dxhJ*53K z>dV>l^B!M+s!-1KF}csxt!k7CB!}KE|M4BATT?KX(|BRLZJv2lZUu-u?QYxwhZd9( zzPu=j+K{hIeGxaCJ~LjK=Tm<*d%N^Mj*flo=OAePaCcxvPeBB+m0q+0?$W2Pn>RRB?t$Au)`5NfBYL&vb}XMjcT7B-)HATrPPXkUx@`gYbh%LmTU>J(=GS zM$F;4IeJllthDR$R^6gZXr%i> zI20xJ*l#?eBzc92d2jWWbWhYPy)NHOL+&s!BT!2CVw6S4Pb}a4lcYkNKQeY;OfPo* z-uSDmN*&W8QgfbPGLzOVOg?2L>U~+s!j3=cP#r5t`$FZp`>n_%)L|tetxASWvv#pq z?s;d@XIua2`pi8>xNLF$o~QE>z(pG=>0-|Q!1gRSwABQnDGsT@g@@d;#H#Evjkv0Q zs(ffxS~bLNozhh5)C-XWP&{w` zW3t>D0s#S2SonvDU*KZHNA=C5wvqV#M3^BDMqiC~Gevq*PWKRTTte6~j3~U8y@+6Sab>Zm~aN~meCb}z^kr^_MYGm51@Q8IDMtkD5TDl&vi_wa#9u`?* z(sn7LT6*yn8RlXuui5w;G22DyE~>>zi3yd=y-hnkLq)Hf)Rh34;N!uoO6Ol{< zR4FEnwgg=a&#D>2Sg!W0#H0^p__}|H#=Vw^bOH=c{*84njI4^s8L?ox4e#tUJ~ zC*11$Y`k&US1m$W2X6G}+pj=%;BcHg-Idu(z>M8&B%ybP zJn`W$t|}u`I?0#2X~-;A&&c+E`c|$j(pGl}6W9`4gQp5QAN+qn+4 zic2^CjJZ5fCvC>#$t8&h7-0I)t%bxBH^OmbnevbSQkPHpfQg|_)A&6z%{HaJcG1T2>f#q1a8)Fv7V>@onIhi*srj=ugj11J{o(`g)&Bh<={CQ?kgZpg z!{|G`io_9D;!sI)p;M=}>e!xhZ?ZDcJLnnXvbmT=SkzB0!HBG6&2Tol*RUhF(+|lf zt@%h9jH`6&X!^AIg+IMg@Aj9n`KFmic>cLk`-U-PP|C@>%_clN|O1@+m`u(iSvX1&3_fu*&#y zN^+ue^b@)%*|W;?L1+Ac1=#4Ey(VuVK(;06ba@@wv<3c^SHT4eqWINr#MGgs3QjR* z2&Bq2W=K3v={CvXTl68*l!YWh-xwZce!CW+k)&B3LW-&GkybmWadF7m_#;bAPWYXjts$|bqWpDQK8H#EmH}2jN@Jd0wjBbAOH}fJDcs?inacfu-$uPvE4x&Ap5i%s^BHZ6vi-Yw0}Er8GF0E` zb6F;Fl+7I!`KHjMeYu7h`Wg^J-=85s?`)VD2|uFv*^;IOzs)0HinlzdPNLL2RMz!E zK_|JKs~nWpxyPT~Mu%ZC{H*fzJFup}_H|$o|0KRUUGLgFsFk-S)9&n=nN;H07Advugs2)yVboOeMAH$`%RbJZ6TZ#trVUQ3iDA2knOiJBQk z*ag8iXTsRnen5Rfs!a6G&$b0M(8P&vikq&l#TGy2-|A2aW`4}v>Gi!{hFQ7ADH`g+ zvP52}PrdIOmk&2GXXhns_`F41Xmvhxt})#TNRp^KocOPwZq~;JgQi-CM;#!dp}kU#QHP z8hKdlWBq1sBr$p1qZAB9ISF@UwjmuD)mrLTrL261P=JwysWk9cu>qL%d3}5gaTIw# zFr<4XbuZ}&4(4U(xIiGaNs`CqHzcVrg{{tXsvO+wDot}HX&pGh1;cFG(g=%2HD4Gk z4(>fx{~mysP>G*R|XqVvruR&r=8h%OBv*Uf%T74K6NflF;7=;E%!C9-F<& zc9s`mZZxa$!^AKi8&n_(u8j!hFoP_DA!258wCjq z1uq10$O7c(D}4EDX;Y;>J&S+2;iQ#~UUA<+cZcrkOt%3SJwvbA0#(u^vO1;bO;b4L zE?>yAn?z+@>@1WJVsMcO^?>)N5T&fZ*u}^>eqwxW+H@m^`|d)OGdO(G5J20xtHjaH zpssZNtEY;Rg>9Nhs&;}U#U(KQd9HA9L|!yC*5Gwn^tAI*6n&YhZ?mqRvF_@boTX*N zg#oli@p}sEZ0lv;THbQEn8~~tpuEYo66L4g?CNbBG9t=E9;CE6r+k*wk{`2vu?V_M zNxNoGvp`MpFE1gn2$Fr%wd7JnJY67dm>8A)P+&MAJMQ>zh2u#g~n+<;g zmOT|KAypRm!JK62AoBjUw!p;ID?;J0E+sb~=$(Sjv>>Uq`iJWuPFZSI zfIUF<#n;p@jmu1C&3#K!UMHkaEPu#@bp-~6Q_FT>yO?_6LF-$;=dm?Q)=;lTziHdk zucw_nQK(gPZy7MW$LUyfvF2j*Rwrp%z7_zpnU)RqqD>PM;xJNbN@r$1{Ov%ZweADM z`K)BcL$gt*%OJ(LV{0#e2J64G=h9~ZWW~<2g0{65_l@q}`cg_@W3Slt4U0m;=j2M@kFeFlE@ZmWPwy=6j)03hB3&5 zdMrCB{^CA_7+Ac0UxM*&b3ICKgK*!HSd65i_A`lQU7UR41jo%1^1i^*CH@W}!!glv zo=vwG;_wkYn-cP5Jx@@N)YT|4m}7GA9GMaE7pQH3W|fLigwv{JaKN1AUqAQC>@Nzh zT-Nvh*aF8OALcxszNs_Z+6WYC3E8=?{hV!Pph=o zat+BX38bkk$Sur}TjX8~)*!ismSz+hDO$ngpGCvS7Tlr3Ze78MA+jp(!46C|z 
z)GFTXB)=A3cg~R1JtHxgt3jaD9IXI)*vRwSbJg7Wsne&OiAgHHHYiNb6GhBUnQ`@a zP74RObEVcb%__7lvorG6uN)qQaP38b%yi-0MyHOZI>^x2!&fqo!XCN0bTBa%AA66c zX?d_Vc-Fnv$6vYH@XKqsJC96LMeaWj^=?o1UWJ{W|K#IXv3p z=L34gT+tyN6S&7tHg*xeab%MfVw|2tyuo_LF1iEZGSdxD`%|EiMSaaFUCQ^WnJBgW z=3*DQlUh@J=-{(wY_&#U@xI!aDGN~8QW>@DkqJ1&Xmf%OmcFQ zTIj#1|E90%qlTJI&qc1^+Y$Uvgx}BUZ;OwOKAW$d{-q*%i>XP4kqAkz z#ydE&(y9MSrV26}k*;4>;GYwGen=(IeHnUg zhP;6W29aP68L2dXgTIbg4kEJZaWdK&_0Z7leggjJHX_b%%g&UR= z7znxnE~X4GlpaW5Ml&=|SKC4MWFL0gB^K@?PL$=3_?Mz7-vH&>NRNkY9h3k^`2IYjO3g+dD zi#nl2h$#Qjgbg9r$~!BgWYRl>S2aYxc57&|w%QCE$Xi(Tw!7~F{b;7YWH9>I(IJLG zd6g$bzH$99()xsO@o|qq$azz(-pA8 z;^x+D`YL7iIq;NTgwxXXxFdY$y#ndP{%MrVIKG$Dy6w0JAR=V;^6vF0y>5@VOc+_r zZU>jK><78?gfIb34JfRAJ2nvGu{tdHx-mcv?5%dkN-L4HWZx!7Tct*u*N#w`#&SO_ z-<~Z60DB98^YP&zAeG<GmB9bgLVHU8-^4+;d|Iw!3#%o|P3td@B;P>jhVY!y+$ zu*M|6^*n>v@|4qVBxs5Ekjar_K(s16xVG)DD28xWr&o(UbIcpj)m(0xm|kK|$4)&< zUN2qye#kdyi`^jqf=p(6@bC>ID(u0yv^XnGZ zSjYNR!(U}=9>dmb$j)l-nJx$3?tmEAUUlI6QVR1-=^}T(GEC+rQ0-VPk*!$mb#f~? zOm&6(4IRe5lyV44nMzyHVlDi~1(#us*-K<}gsCQ*+j%+E_a`4FT5N@@HxO+eqN``l ztGVsKqr#~@O{L2<TK8)n*NK2ir_uzZ z$8m=BDo=Uw68kc>H!RVAIEdvN;+{hLjexgF*d5q7$YW7RFJ4z7x9$WMWi>=RH@sc7 z9hulYJF?*Fmgq)~I7LK9Ga?9CQ7XjvOz26#-oGvXC*0{DQqt?`k)7kAohxDDgRmop>;m7 zhw$Rc%lEI11Iaf>Zj#E=U8ELXcBiw9)yMYdPLwQoPU9oJD-wX))sdS$m(hbQC1n|7 zxA)hN@_pePdLIecc}ALoC>L_e(JiTx-=p}vCzN;&IlfMAoq-g+%&!shLHVuNRGBfR zO&0q3EAM_RUy#Rkwjd)+t4B|No`g;Yu%rD#+;xn_E-Y<`3tn$BGMK z;z)2#DZ_UQ(2JR_2OolalXLd_+mCxV2qzuDD1R2j8OxX}EnFR7*~^J~mpD=U*w@sS z5Q5;NQOXOQYX4^Vo+Ti7PFb8`^PJbDSrPg9_3a2N{x3xnN zpXnc9)xYjHL-8UmB7>fjzCS}Zv^@{>Lm3Imtbf#Bk2V{;9Or0f;cG!E&Rj4;7Zp&a zNXzI1Z_tZB^N%5;^s(=6qo@ZTtPJiPwj8ulZo%N6+4F@*uQrpxEgiFc-27@lvby}3 zyoD4&vuAF-SJ1X+=sr4rI(a1-4DRXe<$soTq{i2#L$+MY)N@n5r;q@%FRPc7kMqOV z5AB<%t8{8i^8(bEW*#DG*%_y@gNj&h@k^lPi;D9uwisielH9S=R$NAFg?RTd84AUO zS3IwDGFdxUuP6EG*!dkmH&i`t9f1+67`%cEGGp_eRO%H%4)HbfZ!Uf2Vv;D-;tYPB zwtHc&Eq8_%l)7qZ$%hv>MVp{=UrYD2?QZ<$IGC&}444JJ}LzD2keoZSTrs)<9Tldpucw5d?r5i>~HpqXK_BLzP9o4kAj0Fgu0pq+ zWjoQVYYo{@jBymBxe04~KR&kYjM*wXA0L2z4M@ZwPvbM~o^#i$f0!Mcp}#P`Jp<*& znOf)snxln%y634Y76U{ILoN?Z%Z-<-yQ?d(R%@l!rxet7Gd-wVDnv3yuO`lK9Hy)U z?DN}HQqXQ8mxRpT8V>Rn=FS#Rls7TapY(KcV$Vg*Adw1MrUhE0~l~->R?Y!&=+Y$&GcbLTbgz) z2t+%=Cp6f_6odE~U?4<%>W>0)rr~kZ9&;pPIIqJF32BD236O?TT%TOJnlcz; zJS(md#WoY{YwG$(cWI?_nZiJ8a48sLdnAT8DZ?`F9}T9>Xj`P|X~<=p)9ds_M(Q`$ zr3&SKI#OH*B}mP5dE4o#+xe5wY`Jy%AP_llwF-7|#gHzknktU5dS7YdDeu42AZGLl zO%bIVD-+$;*GS210@Np3&o*0mL65Pfa7k#?puC`!ixyohbxCoL|*+_pJy$oBp6%=kY*idSC zg?4|S)ZT`N>nDVk??))1gjE%OJX6~9`Fy<+rmSaaS?c~HI;p!O%n6?uZ$%m#yFdof z4(LT3KEmZ|BP?+gyu)etVQAH_Xp11@O1XYV6Mi&WweP=j@;Qv4hM+Y~SJ%Kt$>!g} z;oTLe5GXZmzF|mPV91E?dXtC-Zl9J=zdIS%eR~NF4xIWk&6j$`qQBTaX$u^wj?>*0RD7 zJVuH%EwY392YxMMfX&p&^G1#rGc)hPcEdENAEhaev>!#?mAIcrz}(dg8Xix*6lN2V zlrfZP_pYGpGXgR5!AoJ`%>uor+_U5-N_|Xkg3eUm3|eHACR-)%walquZ6K@3fmE9Y zs`HyuoP)pz97(v`f)&_!6Q-j0_%;@r5P4>EU`N z#be{i6Bugiw?=3KFmqzt;xTF29`}J05Wy74h&0Na)t&?36{h})i`haCiv%KTpo24~ zA`VK_(%``*hwt9#vxgU;9!9I5;F`~}EJ|LzQI^9)J|0jbP0*C&-qatwuX6ob({TMR zl%mj3PiHoOGp_;8B%oiKAeL|>7e%1F5*eTw7O7T73b(F@KgRj#YpuDNrG z%*gB^2h*W=U@?&<<4|!K6Gi2m4B9v90>JB!Z#8!?6XOt< z^@Lf}j*>Av_tkl~*RLS)<|NTPyE()YcC!4t|~v+r1if*DGVt7O%#Gi-kYWw9ztHrr;~yO&Rso zR8Bs}JXSqXef8Ys&#xY7iRNUqv4`l4vN9(3J8=!5uLwVHc-Q?{Da-#^t7A~c0`r@^ zmrBCN#B74ET{$PPK22KOAP>q#Qj6ys%(DqL)R?j%3e~4{Frj7jr`$ZXfBr!`4#rX= zRyie;`bbaT+?{{w?y}_FHFJzdeywV6JF>N+*o0X#v&fQV{47N-UqjL)$8`FI1q`IT zbg2)K&k_c5@-J>g%N_Wv50R}~fEdYz7=4aQN0w1%h-g)V9J+m!eE|I6x|o?6;u;JM z{DGK;dSmYka2s&DPN)t`lG{T4DJokV-poKFZUvbUPm(T0^M ztWzc4TLgMTcAQO)d!mYq8!lnQi{o@8K(pUl<-(DMT}iwW5m0t0+P?;RK%>60lpZ$> 
zU!KGL&NvyF{nUx?Sagw7df$$RF->JEBIb_=tPvM}7x&%3iqqzy4m??nQ|YuZ9{byl)QNP3Jg$h*@|>VsGtPbL@A9-a_->ZLy!|W0{2iw8BrXVj3Ul6 zD}~LvzUk37FY1}VraZGquUHhHGEZ4SoU6{{$@Fx0#hU`E54VgKb%HTAZ*I)MRKM4L zxLEs38JkG7J+aRvO8E;IhoTuLf!vDL#b-ZhLH6E~da)GdP?hPpoCp>XDK3gx=R5K8 z3KiD}4&LLDEfcP?DJO=c`Qqbz62x}#*D<_89PIf}VJ(GvP~&2rId@;S+NrlnjS0F$ zu*ta@F6Yw4<8=zD+Ad()`Kv(swL@x)OnX5GXp0GCq9A=SZEYxQB9mF`u^gewD0s0e-WSJIfuqojOKQ?mwOceB^Il&17wI?Xf2H zOo-?AW`+i{zRoLMNbHI2DDlWqzPnme8XC|r#Wg?|FYO(5g8SX$fSG3K7A}EXo7c#w zb@IVBkKHnL4^$QxE=#DUaGe{O_nIlsK_)+8MG7e#b726P=S>ASQ%P*!`^~>ZapwqB zOhbCV@t10s2mV&z!Jgq?nXDWHe``A@>+1aDN^GgjN-$`U0`wFPSEPhp< zs?t@%mPt9*B^{9$#mUAp4Gabx{!>yr&xeqn9IC-SW2TMBiC8V0_ZC3}b4FWOZCe_A z$S(h*6D}?dqn=Fvp(RdQVgQB2BK@EH~fO^*Ofbc~72II^yj^{@_V8xx-orA4ht6 z0&>uGS2E!qU`OIj0$Oxoydq#_O8>A>=Lq-WRsm04OC>0Y(*V=?~(8II1S>w z-mmxP_wVohPq$~zInT%AaXqfbbzP5Z+%Hg<1CB|IPFR)bvhlj!J--=?h&T#jDV%z$ z6mDyf6^VFx=Ep_?@?gkrL_{r;v%v@6@k2uYaXR-xP zu$K;bhyEyTjzM|Cmk%)sP8NCZ#%0={F^c5Z6}!}yXJ>Wsj)=KD?V?~-+vbXqf)b5( zuwr-`1?)fczc=`5d~r? zH0skvNwv?6pDK^AT#w`{*XJ@b#YI(m)>k*FR>ZuSDwoCP%!nIj^4K^EAl&A_MdkF_ z=0?bX5!O4opcP#PR(WWJRT>v}-=n~S@1e_r6X(GJ8O0I(8uEU4LD@Z3lDh#WMx|N= zBx=g+bV{FGH)6l5fQ$##p|jJ5r!l3Z=^xbI32`C{$!k>#{%b=`-{>aJ&X!30NTEIc z3L^gdLvz3>UE8Mf3Q(w{F68$rQ3Sgc&Dwruv2Hh^nRa(i6LmZ?vJX=K!;Q>PhnK$K z&3lOr$>})dP+n%uSQr%Wq11qeaue(H<_3=PP{6{R%Ob2^W!y_ZEnP0I_0DlA24#F%vNuQ%@9(LP-) zVC0g>Jl;3{<$>uBeJc?ghWuLA@BMJ|3{Zw^gF*qVQA1Jt*+5-Vn2 zE+*EjpU>%ya7U(I1j2!hOowoLus*{Qq|0GN>_M*7R_)sqMg@H3kB>z52%m}clG)DX z>^=YS!mH|Q3vA9cpcgI9<@NLTRRKc}99NvXvsED`pERq{tnaDx>LNkl==wc3=~SWKT4=+w`JsG=)3CpKA(;GW7+FECx8W_GvYwntjsR$3RyiZ0@+PZ89?}<7#{z!?F2#CZb;D zg)O$GCtB+_oat1Nn0Z6@>TpjEB{)+vsO;kdDzs37Cn%(Z+%fyIhUxOFR^vuuk^P{I zy;>c^i`qS=2ZaaHYxeo=?Q3^?FqB|xCsUhLRfXsAxP|t$3mVZ(ph7Q%_eqbH3hTw2 zDK9QLPa{HrjNcOC&j6Y<6|ng&nv(*E&0R3mBzSBcBeSbscRTG%5>(U9<~dD(wlI6U zUIYQpTLRX79j%cwV6vstxUaFwQ<_Fl7jG?cH+X0CV@_sdJuhKw>^oF0wa6;RL`m}Q z?LOUh{RMPoK;h{O#VLR=S@6#?5;)F8AuX@-dIqkiSDKt_`vJY?Dt(8$w2nai?ajh5 zR~$wZrSr~+_noQBt~qGPXiXjDonr>Y-dFsTlsz zW>hm#F3_h;Epl}4uo3N=0GDD;eHn5afBlmDAhPP%@cV)xNJYpPnt7SjCaBw1Y@;VS zY!0DGUZsLVPWN`&zJ*t!Bi~0Dd$*nzJu{am zzjD)I@MwK>~;4Xpl_U2zo!nYb}{?3hgiL!&L_4vZzqb| zO}C4jaus!srz}O4sPdu0RxK~n#$!XEOq>2Y$j~pulxT5^QQoTq|Yim9(NK(_nFmi3%IoD{c`kwRjbo7nc z=y9-0`uwb(-v7kcG$7FD)m){Ow#KBs(lWW8CvG@@>-SdYa4~=w$OuSia2Rn)-`Q@A z0IS{>Y#Sl=9IKEh_||`Qn{B2-pi^r>O%Nr|a7#SsgZuiSsozQ}H~cbO!Gb*rN%tB# z4;kR%DFxtKoM$t}QO}%4Qn&54UJo)hUMm(#(cbCap-XpaC*&tJ1-I`X92R`KO+@Gh zlrto+wb?IDIvlxkA!gD?HZn4JSjFF{=$K*uFw?FMeaf(R#LgbN_xJkCQYW~-u*ly{ zd#BT+i~0w%1F}W>81o-EU2RSg%!lBQ0ljGsfI|g|i;h8}S^ng;QfQpz8=L>YH%`z- zr~eDR*^%U|-NPz{DPy=Bi&Hm@!vl)k=>+&paaiw6x4N%gU}|^n3L;Y}zJ2x+vz+^r zlBtIG<;^1<3XwyJ)X%5oO5E488j4Itt@qpxx2McFJxA|i5?fWT&q9~Ks?Ts zHAA_F-G*`>v_9&LS~s%A{e=;4gYNq8WOdSHhW*qx4t$l`{>fTD$$)mM9pb)5Tz)ZA ztsnx5r<6qH{Gz-@Fc~sp>{!Fd+ug16pL(qaML{ZOgsObKlzmX9KJ;jD<^&{S z*E4$Egr{Ghy-eOU{!BO)OcXbhn`MqRB+F z`MfeTQelj?{clkq;NoZ{=)w<5Ja* z!k#S|p7qgRpG{{?<>ACKCgKko(}cRDyyZDoOMTYl8-1VWZm#~TnS=b+J}EpV^Xr1X zWz8zQ)%mn>%ccC!>=}(_FH)Fk3j9h+PrMcLQn9&jRRkWRYYgOj!YBfu?7ES09>&E?ogS`x{UTtSje26P$_+*tBAr zy4$vzsdN=u#0!siwg)H5py)tGgne1seHq83uL9osB5%C&_toFkO^j$OA69MZ{bn!S zSYY!epRV0tWQv`5dcuhRY;y|ikePhsP=es%Av3#Evmm!6OgS>3_xj5K)S+uDJ`cW;Fum0!mMFc>HmrJ7q^l+`DOAVc|N%7Man z_sDuRAwhkAY4f1^#HId%lDCv7-A5QeGK?w5rH1Xk-oBcs^|6v|n2A!O=&dSv`7e~- z>kOI@YExT;4iyW25f1(%RglgD^FsQXv%Q3RwdUxG+O=c)xL~vU*3Mk9?{}$=`Bi5oaG)J z8p+k|4`X`Lk$^2zxY21+7cJ#ZTUXDgeiy5r!81BU?FEUZzQ$a}2%c^K-Pq2lfzOBR z_K3E|v_eSf6>YKB3+L_YJRqCVcNuQPj}ci1}yI2 z$w(Zpnzz@?bqJW%*Pq!F;0K6<^yww}5v)%jvN zE*`_lv&{!BSbo&MBlqrXbW4%(FHE!gc<@c=y_S3QAX6tsEi_Mm6xPd}={&e}(bGf1 
z_Z*++-8I-Zpa|)bE(=5U7}lRyYP0B6?`toW2#MM5$Qp1I7>QgH2RPO%;Ob8vU>2P; z+(tUMPqaQ&ajSxBXdETnipGKO3tCSZQ!>1JJCxEqgg_mdNcIkRq*YNcdfw=oPoNa` z2x*VYC$lLoUlYot6ecp#AtSnn%51_c!}usq^JAGK>pPD|==rNY_vN;9l0QRIv72P* zyM-vNS$%?Ph9;X%gS)kt)*a>GRxaiy*KFD+HOeK>NV2L)l4G}sC%ij;y~Gm`|7N%`IZrciOw>QC(n zT4kXJMLawN-;uB%3GfGfYHe5zVZBFRU#Hu*TS0hU&!jvN>TOIidM+dSy@x#2F+(Sa z%!^%^SVH;)5Jmh|szBEnx_$OO4|Hkpn%AQ5lWm><(cUZYUM1O8`&yoBYO5{;VPo?v zZNTWdGF5#YS7Kj<6n<0ryU>JePm_bhpGQQ#gBMXr-K@;sa+cA|f3h=cSml&^fP}Su zZ7?PEInTp7q{Na}0qeeDVP0tHFO5ltz(`kId8mr4 z_lp5Y>%K@nnYGo?>z){E9XwX5Z#JH~6h~%W zOd^Hq6V)kCds0oZrZ>g;kK}LvGCQqc!e0UIj(jy0TR=xjZtg<4S~>yk);MEFToGf3 z#P}0R&u01Ev?){ITCxO3qi1gWJ!omuM8Y+);-6q&mYdd@x zZiSa|0m@SocsY4k6n5i`1E7U}lC)WN$s@x;i?)=>X7dEd=jF z!TiCmm6_z}3pTAlpQp~%6kol0+Bx@&^Q?V(8LghUPr(8BXi^q*pXy99ruphtd0oTz zn`?VDCHRBb&xQ(I)b=DdAIvi^MXnRTj!KQazTagjJm*x-c-t4n`q#@Y@N{55nL`9 zrXs^kyL(G;_6d)yK2HAN6kKXOgsDQVRIQ^6`!Nxfz{qzw!^=^;C<#MP4C2T=i>E}A zY~*GM+JTDc2TWt-Rup_+Chy4&xHy}+vwSzm z+i*iiF||R&LfLBY_>r%do|2WdLpV62;uD0ELs-iM&EGM(QT4)AI$RtA56@Nh431yA zmFX7#rHh+1OvZ(N*ErHty;SYGBZ16wJ7_6`iU!Hf+?y7ad2jdC^gaFi;?U~&N+bWd zAKwKEV#O4#z&lb~%r|m0a@xyl$Q9ix<%1m{g{2xGP&Ihm)~9{N#yl9hmm7ejW=d`1000fJgv+y1aZt_kd55Xx2+Qm)?i0kgA>?`7|Ia6xu zFb`U#8cI9AAG||R(KCMZ7I-fv%m+R4Zti;1f5bmUdG~G8ZLVf_wwjc<^FVW3z#`wv zT<_6zJa;n^z-$3X07Gd+DPNZ7QXV*FH+K3ExAT6_)-Y(F`c>}9m-{9~1ty3Ke^0(D8Wyeg#iJwDnhr0;RPo>(YHEODzeqiI#E9q&ukMvstj?<_YvA1h4`!BEWo;H5~ZIDZE-Ig zd;cs&L1<>{=PRGaApN##yw5_cI@9W-LP3V+vKxRc9D$tKHcZ%-^W#>Qj=IZBmI zNB$y3>IMjjeDC^KL;26bu_rgo584!*g7gK89JpZ>dpdl1bN323=lxO-tH8+TVWaHQ zzYMzBp*TWrhu4MLV+n{Hwj zVc~NI@p?aB3Dc=(AG@$sY$>(%FdOjt(G$4{$g3A(LBX`kcojNR$YH^2F1*94_RN(i zKjo9CJFuG0{k>=B%^*!mau($*(Q;{t#yTKJbv2-pgguAs>9V9;0W_^~RUiwEN=J5u zul(4u7pFr2%Jf9nucYe;ZI~9h^5ND$JsnsMqdOnb(Oe2NEx5S^Lw?6HK7wh1F~~mv z3i=9cL}1Pl2&Y*>C@)$dzehf>+-3_Jod)G@rdxxA3oWr<=EXe-o1z!{?E0@4?uOAn zp{{$s2lzAKcEnCpBdtrI?nsItGy&}B*{O&j9=(6s%BP?1j@EHDz!hwXP+GQ+mqnxeeQvQLD1rRfYPIl_o zG4mo#IA{ItJL}StqX5E{)5?y*3yBvpQC$A9*(D_b!;X@_OZtT11K(mz0SKW7a$gi` zqEFLVb1qsQMqL>4Q1Dflo;1wq@X}(bwr2wUUL5H7@sqX~2^e@%QYm`*$z)v4km;Mw7Y4aLUMvTy7G8@iPx#kJW zC6%!>BkZGxg=5{kRJM~3eo3)q%U$)CaVQJVUMwC&JLdu$yAR*d=Qo@E{-qrp2IfRa zI%hVB5&tD9mL~T5TYvtd2h-zJm-4)~273(M>EkqnmjuqzEsYz-i;SYRO)5(lP$Wo% z|PZIY<%6N zcPkPEguVrP^7A7_e)xK-Qmrj0_D3L$pL~o;Bk&Urq?Og6nd3(}2a+X0_2)G!9+n+n zuv&yxe2SkDh8bB=@vOrkCG>Hp2@Lu7yfKYPi7B7yg!Zb6uv>RI0?L>4_8;Dh@jxmt zc5T=Jo&kz~Z#ubL#DU~l;`C*7`yYe;k3s*(pnspnRR?9c1T6~_6VZf>USF0Z@G;R` zG9)PXNnLhbL#3e<{GYX}69IH}!4kZ$l#!<-b8)}4yhP6r8K&#Ug~3UCubxZbFI!;Q z0bM3L9-Zhob#eCWEQIg5ugFs0`GHE``kpA*;XB7$QCcBG;SlNf^3Y&Z5O=~r6P(o0 z`Q7^Ka)tlr-Pk8&Es#Nc1>EtzgZTh_@EmpPmh7o{VQT-8J^C3+%dZ~5JG#JyusL3R zkbo}wEopxXK{mg_qw9A(QV{NWc0yMeGDlHSLr7oBMp0ajs4%ppr^SNrPXGd(S9}%3 z248koqq>l&4LE{wJOW3I@;7Rrs7o}q748Q7ng9YOR&X(l5gKuj_q~cpb)&UQ@_qI1 zsz~UHU62`vImwaxbDtfMF~m0 zlVi|(Oco*KCY7)ZhxCnhGZ7K1l720<2B&zeLoY%G|X28@TVjeO1@LeS2^YHdJ3b3N@p2HN-W z(TUKPcoi#&;(eYFB)?Hz*?B`^C~quIR%aRF+&{?=Y}f|Kyk^`v7(7Q6EsCwz@Q6Ljco@HO4$c>-HXmh^JbVMO zlwxr1F0iBSDyheSqNmUeythdd1oIWh>`6@U#4g?GSHA1dHVIR^YKB5}Vcj?aOUQ8q zx1LqFZcmr^!YgP7+5V@y)qLWcho5KEimw#_}WfJCcFRS{6p_jkpOOuH? 
zhqQ&lxS2>U^XEAR^?WlIEighpWAf`HK1P@RgGnKaq$89A@Je3|JU)Y4Ts2% zm!j4Ib5F_+2i<4M`Ugiry8Et){~}p;8B0r&jQMUcL1^ol%<8b#vnJi#IrrRymcYnkA5+KWN2&T*fVVgs?0HFs-!_O=)neQGf*1no# zq|&EZ8(NmziqntVR^jEdUStMOC#B?xc|i1!SJFo&37wUfsuW2+7I~wlM*ALNYJUEFU9F zLYDx(j7d`B!|hC7g^Z*(O9-j7tCf`eEsF`E`$8(X-e4i@*3OoNK zBY&x?OP-lL^k3(4khk8Y!D(=c1$M|x^XW~IqF1cbl;iNvb*6ic#Wn_{2a&(LcJbh) z-O)M!<|^q!Y*E)KU=8e8n|5_omN{qrDIj&Y_t}N=c|BR1P^mWR zIG}-?{Pw<(3{?6|kuB6yr3<@jFFE&}kZ9*h1Ktg4AUDy)N?a6ZSX$!cf2pz%{BgMei%h($nV(=P_t zP-R|$)iQ-serCmCk?YRCX!k~6s>qN(OCGaP_t(icrgQeU#WARv2Ok_d6#rO8Rh{!> zhst6|W{ZtVdQDCLb~3L$Iem;_Qo!_!Ufvvy##W~O$&o~+TW7sjo+jqJ*671RTV@lr zzZHFr<`WuJQN{KfrX0hj2fNv~I{u=7n|J`!rZ2e9wdbAcSh>dS+z=`9fNa=*dsfg@ ztWcT_g+A*~ZPLCU*X5Q-*Yp=nU`!v*eyA$CHb^MHhVrHO?vCNo1wRdpoT9Pg4YGXPc&lpt8=0=N zL081D$*}XBU*;JkWbG^Vok?Q7;GnPZx^c-5_wWdI62B}&Gk#Q2cE+DW0cgD3VIAA{ zRReqKF1MwfpIEKz9>d0z(`j|%7JDYr;>JU8q6^I`q z$5sAUwY(Y_=rVtT#Bj%tVM*0kWV)@UKXGCH%bg`Fh_nUSeLuqo|%$a|@c zV@s@L!JXH=G{1;r1X+`3@0HorMuHqNIXtvU7OlD(t-c}lt;Y#Sng zu7G0B8lE5U835e<;r6-}@HQPfdY1JjTEJ-@iWnEJU2BRC>&y)TcS?F>f6@`E;O>CJ9G2%C zx$77ef(}(wog08q=l#g=<=ozMuE38= zkaJ60C0-P4oVn*crfBlHm80$)0Q-L=Q!rKK=itK#}gMrVyx@L{C}z9Qv@gW(;mfMb+hH`g`n`;(_9a*TMoq!BjPCM)OVTW&%l}|{Atuh7 zXKpba+;);*uNn&iT3LZ87LFW!>8#q3P^;7!Eajkj;Cu7agI(k7{Aq$7qpDec7UYH9 zWefk{&r72?8+ec0tnnN*5e$z`^# zwX=MCd0i9V;n^A-cs4aPN*ct1V)Wet!s}0)dSlsFN7)dl%+!q@Kj8Y6D_^-(_~E4- z1F>Aw14cGU)7q(A`<-X#qj`dyS`MP7(F1H3Fn}jsFSSyCC%drx;_RvKzqr9zsnKs1 zc#vu`Ac-X5vb+e_Y_8Qiy&$=Zje4l+{cu4L#5x~=2+7-hUeazTKf2d+f` zoFrTX!dVzKrshY+jt2qric1JEK=E6*A7>CaPfP6g2>E~i*M0La7K$+#Yyby@>&pmB za-b@}#lGfSi{{Zs5M$rnes+oIobW%gQ&vS@;9#n|PDUv&ROmV>>p4Zw<0FVg;?P;! z?hjq02Lahng7&@;tH9Qm+^^mKV!)fPD(G5-n~`-&4+3jcjudip`#ZyRJhxkt%H*oe znANg&9*${?U-$N47n5T#&_0Wt0IK3K8d<6vf22_SAxGYb-;$oBn8AJWVA}jP^w2}C zEtT~7JyLXno=rbdR=n}d^IK8f3!myJM^M9~K-ZEns^D0VL`px{MX+IaiQd$SjnHtg zE7<1fd7u(ueEhSQ&+!Mypc-n62L#_f8{9?H}E3Mo0md z!#!FC@6sE8Krfp|n@K)f-~I)>Q~tUx+k0afMd2^b?!ntzdH&;rlwS3#WpY<( z)Iry{q0@j%*sQXGn5x*Jc`tUwbnOX!@EAFUQy;e$c2jQ`tti{+(8i!T<*NF@#XEt! 
z<7(B!tj32|a=b|xqr3)#zw`WzKB(t`TBXmP>xGuwDU;K2Xme8M!p)C4U60D?VA_&1 zQ76y1Gm@+M0+iDH52ZwN8ThnCBYB>{;(t%Y{r%TN5Spk9ikeVb3Z4u!ILo3t`8Q9e z+oq?!-0E>9-!Vuu>pEiX>A@P@rV!-B9oiXsjNiMnMzzc5aB}MH8bVO#`gM*)iC()_ z;1ww9GLsZ0Rm&UB`~v4MSHyKAxbd{-qd)A$%b%pVenT~Z@lYcE2ILbWsilz2M0gRq z3oQdrGfYrY=u5Iud(h-Zv!FDrj5c4pdUb$)tc z+GuXaNjqz+WQ}i1-BfU{-9P9}JZv1dfdu{uk;+UweDsV4`Kk}_xs(Xl zoI4>xLyJ!!a5z(IbrCOeSDqH7dP%3`c2ji4n{25vhl51o_u4dldn;rwrEWEZXl#G# ztE5&?n0UK!J-2>_ab%P?<4`D}z@8$W&e%nbNn`Zo(_@`IU3`w$ZVMV_w4Lo)!?$zy zF7A3fbHzTHZE62uV>TvKZ${?K#DRkZ0-B`y6nF%m-|oZ3=w+u0kl+!}c<%f6r=YVm z^V8*rX7S#*tWhfjo~<#im&H;q&Sbo0Ip5Lok}3wpcP5s{PQ$xD{5!!~WIe7#c&;nS z1x=Q&m=wG%bXCs&L4t>HvshP-pdg7R;84R=?mov)CpTGuXY|Nv`gOH zcIkv8(eu!#lOi1U7KvhOEtBvw(%h;Jl<}V^9cS%5Yy`i?h1A3y zmU3trWR=xTh!61YI5mz}=Eh8#Hnk-`%e}r5ytlCvv+6^~H{o^0|26MY0yS$_(F|Sw z1Sd02OXUD|*2IdB{r0D7xz=emmGvJtJHV@z)9?bM_AzoR(TCA%Gd(~V!DkF3Nhf4A zg0IN5MgrzlEpER8w13bM1`Epu7x06v=NA(!RxxFA|dJ`6`Jzk@6wKWuCRw2L0b z!C2TtXl%eVc&yyw`@FURN&W1qb8&w6WWgykf=1 zT|vW*y)Wg)q7wa@n#3K9X$=+%?f7UMb*#t%>{kW3r8X}iJ&o)Sjj1VH86zQ_Qlqxv zn!7ydhC^pdj$ho``tHnMBRnmf*weq=FvIsrLrJwz(?UsXba_iNPcN`r_Rij%(oR=Y z;TYZgbl@dDxa&}SxNE8QVf7OtAj<##^ge8#m(kivvg*ne|LMJORwkuNUK)lCzrOIz z?7j%{Z-<{@j;bZj^!2`1F(~UTd~xJ#?U)vKc;)anQC$0UK_n|N`E2D9-H>mpYlNv* z$M)8+aT%X+sL;AuiVj=lSrQ$UiC34S*G|Cv+_0}+M*Qt-y&FrBiwhUa1nxi48LK;n zzM2#+FsF!+G=8Drlxd5lm zMM5)A0kd97nN*V9cQz*iMuq}~X9?JW4TK-pDJ@?N3DqVYZm)4-p>>hVQlYjkrNl79 zA$$rLsq8j3nX*d1k2A--e0eXo^Wt@wr$}+GM#KpINTyi3;3-l{aKbSl?9NhjHjPNw zm|F5htf!~jL0#T?vo>MJ~SS4}aUwc=5nh7+N0o^$N(YPMx+OxW67LJMY{ ze8FWros&T|HrMs$5xP7+$J(&*`?cyjYJ4MyEcbVjPdQ915LXaQZls1Vr#_pRO4E-k zwHSK_lZ5CegC3oJK*odUbq*!%#HB?|^QnoPQr^}Wc8|V;mWOdYyyXuU-tJ=TlQx@B zL-t23xA;NxyN#xI81-Y^o8#NVGHt)QMz*BHSukri&#>U5Gwxg?0u6TCbG9j4Nd|rJ zzaju)u|;^oaaAig59iZ(OU~;SQ%KCprcJ6sxnF&A-k=^ycr=Io`#LQfhGCb%kK%4u zJ)UWkt(AuuMC`&PoscOxZVl~72 zOa{87SNl~rQ?%QUux@SGPs!#_ar9;hCLVKNs}f4pOel?Cz{SOT)Z0et_GO_RV}Q=e_w{KhL$HYWAdW!9ZfHzxN( zN1Qx#PV1N4NYa8oTc92}yRgEJiiNK1jQs}=@zjKSE(1th87Q#AU1tacBn#62s}^X) zJz25Q2k}o3B#ERSMhhO4^Pec`-&L4$ID4>Hu*3GH_PnE_GV@&?)HCT5IW_4ao4Q+6 zfg`4xD|}_na^?|E3c$Lc)!d_F*=e&>v7+t_;$rE(B_&gRaMien;i%lKW0Zz+sftA; zKmQXoLi;Ey^r|uE)tRFgn?0>2oo?I>e7%8(lgn#s6)>VNGZg4mlc__EtVZuRZNGMI z($1(7o?fv#Cg2WYB;?GeF6WjfL2bURD=+Z0;VeR0k? 
zU7NQMm(i$;k3PW4%YD&P7(0 z2#0UI;NUTVLK7|38|Z0fk^}jM?9VJM7PXDQ;M+Hn0sK}-7uFZ#tVZjhj*xjt?t~Y zF3XY&E7@zdl4FzDIY-L*Jw;Oa-97iP3m;tPA4NPR!n(UL>0Sjd;@)}0i=c26vKjTG z^)wU1rewjC9ezHZq}{`zZ902^fNckI0|b5Eq(jAJ77o+VhZm?g=Ai?BvcICDVVgk} zjMOr;R|EO0Yk?DkSpG?H5?>uNqr$^_6 z={SHb3}wwM?7^je->^v-;zprs*!mOiALYhZiX4knX&+jtXOc{{7NvPd~KzD zGfN46h237M@Q>sSJN$e9B!f#Svs2z)NBjY z_w^}qbiJi?X|r$9&02Eq>EAedmeXYjyIB~EYokte^FFkfmby6BFluFvhjkB&IuEb0 z4)H!l4(f0H(fhdR1r`Y_Zhe-F3l}=ZrWs$JRcc;ba%FpK^`-VToVTKmZva(iYgT#M zNWYoYhMmLUgC&ju5<~Yy?!LWmEhUDHU)WX@+pkk5G&Yh`Im$}6ykBU&zIAkp(x7oS z$=_LmG_!&Ww{*(U{_>aF%r=Vy*pr!au{_*cqoZOKXlGp zaQugO=A*;Ill#hCOaUL&bVDssnjAV8B*AbkJ6@HG70Lk8DwFS@r99*(zh%e&VCHL# z>YVcG18O>;s7eDar6%?!zrFRLz;@u+d|FO0s1{vm!=sdey*xG%Gq$%}%d~Sr*Y1U*a zFs^;HqCK{JhHS@vO%%7(6efhJ?sFPac8-@3ByAkpWN2qNdbq`{!#0|#wIo18rBtFT zM}Kwm*up8t0*%1a>A$>|=s2E=6yi>P;@(sjrZ5FV;+fAL+fzk{+;e25?04^Ppk}}p z4jtIPIP#F1v7iOVYrt-adMl&48pySiT27s@u+bE`QPP3AE7yPbi(VzafeaotRH{(~ z|HRXi`%Cz1Ze4IX_8;E`(LiR*A_HA6R)(cqx7wLr+mY+LQ@Jem`kWJV3P`96nA{83 z;7W~Wj$qqCErEf`H=8jOTrD*XE_h9dv)jRW%s)L;Cd7*p>Be2@&4{?h8VxRe6Gg|= z1=3sXU%DTE7T$J^KvQq&NNF3sEZiY+v^Qv|tpo*QdVN>ONh3=(GJ#k~b8^Kmf@UXq z7u8;5PioK<{e`!~_5!t_Q$eC@Y3fUOSluBBEtpjt}VK-`v4Bt8P z$jnb0Pt1iU2EF39kH{vWURLHNoVrQ?M*4jWXagQ10cw~G_cdjso~^=`ssRX8r@V=U zy4@#$ssOEJ?dm_jZqH_reKagwP}v6J=5F6C8l0;&A64* zez2f_Wx3RDb7`HSMgwP2d^aSE@$KqdvAgxm7;f)!>&dS;yM?!TvxNtoXLBvxgrbTo zq+-sumF>!uB+6?S?qA3=4c|yg?@8&iQE_qWcrZRsk?KtL6eMl5??G}>7nrsNR|7V- zH@7J#(_9zCsIad#JRhl_ckc*K(4?I6NlMfB^7fpq5otRnGyIgo4vqNa!YHMSWh3$A z+Jc5uzg6SzZcnzC1-(k;5T;-R`{~^3#-YHtX{N=oj^;D%?bC^AW%jai>JS7av>WfOo?jkxNxa{38YG_2%EJeHA;` zH+8=~%|2M_TUYNKQ)zFhC`;MAzERk~`{KE*Pm-L5b&`=~4~u*JqeRAX_h#Ry_)pQB ziaqLK2P3HCV!$p#)~%V4$vB%+kToVlZq|TyDfI@wIsGxSdhcv(WiFGH#3A&ITK{Q( zk|fzFVUE$Mn24jj=<{-O@n06xyVuL8D6ZBGaUbwFfK9wYyCt0XT^zNS+5Z>~WKVW5ir@OY)|67IwZ3ThW`>uSn9K#pe@ub8 zj#0B?g(S{ZR)1+fcu|@(c-Wv7BRdfK3ef#qolhw*zEnY6DME0id_@HC(f&OzSWL#V zokw1`OV#L-xbE)M-j1kTtQPc*_Ns$MJA&8kBJ?~M!Y0maZC9i+s&cJ)c$rpT zM)M}ER8Y5lGc*nmiD?s|{w;qG0rKXAi)1|%} z-Sb(tS%GN6*HtotQxv6L$<87{Ni zPUijjnoDQ0THY$3rx-LQd2gms@kDP%SNsU0Vxknzv{mRn&jGAqMw}pZ+s>ZJK#OSS zc1hQmhfs9ZL~J6lY2?*ZLdODd?Zl6aE|u&Z)X7l94q6toL}Qx1_2IP`(|xI(e^}T3 zwZI6|&TV&G=8fYRo6GXTp33CZ@qT`>s>DPG&dhj;e8#kUyQLFNjpg2wZ63q?0c)MD zBbEbQ4ND?RPn=Vw&X3dw3whg4+P>;=FGRiQG;8UcD&*_)e{LW((WIHmU+xf2eC*a1 z(N@oe?j8p&JZyQz90wkveB!Wwn3?F!dpFwI+I;p(eLNm&xi=6eVWP2XQa==kmQ{%@PUD=z z^lQy-V0Q|~%NaWvqzthhL8eYY*AIB`E*Hjcn6!3i&?+|_%Yv}!Td$0_-gkY)<3 zOS~~65GpVJp8_#tD27$!yc5t&K7dQaEay<~DP?t~QQ|U%XY7+K7*GPZlLG%s3Ofp*zn*d= zQWku4T)>z14RxJ!;%Po5ZDZnqiRJAU6sJ>u?}NXOZ#JbvO4-q8julM@*okz)Es|oDqrrZhz`=J}bt%YdEL0`sF@8 zs>A2WWYv?cVi>Q*WH;{alt~xaV9UJ&ESkT3F}nUv(rBd7ZQ@g$!=95!;IVTv1SN@O zk8r}E-n@+TEW$P;Wxvq&?a0qWa;D4Pbl#4 z|8@ZnjX%EF&?U0ZOa91b#R1-`nS@ZzwsHybk@~@{ex3CkE}X6hzqMGFYw_(|uH9+V zqMod+*2d1qD*e0877wp^AB$w&Vcs zCFT34z*(07XI)aS{Ts)>iwZwf#aAX!h85j7g^WkecIA>I^;&wxJ-LJitlGJElYw)pStyf!aV`+5NY3rF{66Z>AmGev)Z{%NjLOBWWryp@zvL<<l#Wh8Y)$})YNT=Tu@>KrmLe98bJ zU3Rj}I|d$l*PWA&0qMeNw4R}-uRctZ~+x^%zDdoVt;MWivqrIsc9Ixn< zoTYu7vDVhLJV%5|YA$L+#EZp)s^dMD`@xqcVG1M@JDGZw8js*+FjT1GQ#7VY|KxFd zjYEZsLhU$HC`&?zLtpAS(z;Z~uOA_2gvrIEPt-bpA2yaetV?3MYddkS{=*$h3_I=> zf8K+4&h3ojli$O5`DBbX_K0_7?+C>U@A0KyhEhdcsd&@M_VImN{C&^+DDBcx+hKGW!s(Y)-JHfrE*)%x2->n@!%8C zBo;^-kjyDJ_Qs7KAS}r%SL6;UR&iPywssOGq7X{8>)(0v#ipV;6SucrmeS&RRAE=}uB2sqW>H>?>{3-ArTK-_$3qt{6? 
znKv4HTH(zxG;`Yuf&G?!mQhiebj=i(a+cyy&Sj-cHBr*WB`p7|P3b72@g!_?>uv=} zzIp#ix80q&$lQsrdlM|U{_Vt90Rz?{X)luRHBC3Ib@rN-dr%0*yo2SoG#nS6%AIr2 zj0_e9wIq^`Sg;PJh9xAp`HxF;PuNTTAbMs^f4Y|7F=O(xE0CxQ0F(YL<+9wPeRvC( zf1ZDU@VAP+az2OIX@QkQ{w3fqhg2~t%kmNCy)6``I6ZbO-2$4r<<&KF`hP$M=x@Xx)* zPs6=KEd_>nOa}1JHa{bwi>7mgmeqhvEFmGGwwzt}`#f?1Amz`u?C+a$Gt2;jA5&oQ zY_fvF_QLVUmAQhGm8{5*@;gB@AAPbe-Alyr;P@hiwvHiEJpIQS2?F6+e;TSw|JQzg z{dE8ZA@`1Kt(03yvcacrJ}17r;48GEcYO2oS0oxy%lSG{89MQACBR*He!5|Xy1Vs`VTG~!bH zq3Z6Alwe*k-8fV+f0ItI`EAfOqo!dk%dV0}=jE=>ZH@kh)=cpm@JhOz`kQptlDqW0 zw)Z@C zxKeq5_yM#iH|elymlnm>+W0=~DXKAD|rS>FJLcqaq%&BOR4r5QlJdgij!?5x~1RJ5c zS<}s>7#QHdB)MwV)ELp!fK|aCYX1uhP_~OclhJ+8_){Wgsot$*Jm=%Vm?7v~P9_%C zNgS)$?QEeqCI;#Y5#uPkkraqeR*{6L(?A!+0!&4b$k51tHL)7K3QcM;Fs&vtlGBv& z!i5hb_0OI?o5W!Xfh=)gamsKShC8O1w5P(fBdOQ|{pvspUIn(&d9e~Z>02crLhLKm zV1&(hA+GO;Z9geexr;1bht~e7Cn>yA&AYnCe!o{w6og4;VRzmW|1Jo>Uc`6sDsy*C zw85wFr(7<&qow`Xzy>tO8}>Qlr2v#Rx0_Du<|J-|D(ro3@nB_KytBz?{EV(roI7!2 z0BrITWMPcRIFBMq>jEjwy!3$jmoHyn^4!b#_}rwaOw4SH%UoH~ql0X2i$UaDUnQw0 z`dsE@95nk%ohbK@LMI-R!)$MNww^E0GQ^cIT z%)ecaG;lo%7^`@G-oEr$``tgM9Ie%~_M>Vcu2Od-gv2 z@W`)#>iv|gj$}&-Qho&N0SAydMJX^ZRs%XQi4HkM&`dC!C%^rX5I|T zA&W~+54x4;=WqJE(v25~gJOKE1G;Lt^CcZ9hjkENh}4}HRnyht zYN%YKfrM3hjSE;=euR-dh5C^4HNlnS%SQ7Iu4yXe=-)=_2*6~qw55&dhl9u={X?L! z3pj-?YL@DYzgsr_JkY}&8qq8N(9J(g$InkR8E}|e=x29E?wG1!4^zgCNk?e$3u@x* z6c))FHHx%u^hcVkoPjeD@jEJ@8TWqCQv2kP+?OHbkh1AS1|lkR>r+6^a?$Ra#t|oM zCQV@uCzy=j9`1qOVjDyo9Xduf(0Tq9x21L&%#wh&iK^VY_r3EGxj=ZO%^~k3^??80 z#X7A>n2|>w83O+JheCPz70=e8>48Xbd-F?k&UTAtZe{LiBU@xoVfkzT9r~8VE{A;0fNaI4B-m}TjX|>0fJC>iWF=O3 zIcneXkNPmnEm_$Pm^-)f{RNxGeEj%vxGU!Ws1jG|AXc5J+VOYU z0rU9=V0cBB^FSjuk1nF8c;P-btvy(U|o9%f_Ezg#+-F z3U&IIdHnVKKp^n2yU*FFU;$4p;Onm_Vf#SvEg@V6`P)YCUKQr`8oPR6D;)jbSAmcC z8zLh`#J?Xd&1={sQNk6Qe`&5?4@cxZNQ5`(usa+9O$lsLQnlY}4TSd=G4zICsf#SRGO%WgC6e!}&Ts}yyk~iQ5HXhPqT8Q1O&sk~)H|GE>gixTSlD%Mz7QSnSg46z&4}t8( z=?{AWfx`Sjp#D0AL3qj(7WYRHKYJP?kuL3(d)&x=m4Cb)L39X!luf|I@jV>y+}&Q1 ztJx#!3=#Hlw;OL3by^%I>^wy-#O5`2jLt#*d3s)^+h2GB6b2S!#lkqJgF`>MKz$ zHWMPJ!-8R`E3mPyJOA}PyyU^8@t%I(;Pvp23i3Vs|?>g#|pgY6}D zrZX_)xKi)*wmE1H?qgNQ;CaQ>Qos)su|*(r0esm~_+l48WaR0u9sd0qJlQ~GwoZNj zGvJiw#BC6nU+xsm5d2c3QeGy+X^Df75{XVff4;dgi_;{zB27@`bi|S9=gmmKNsQ=U zLk_D0j=&s6#pNqfiyVQq!j*p=_bucI0uKDXtiOci8xncqm_F|!ioO}SkPNz91@%8| zIn5o!hTM+(2Kf=16BoUVm`JXhEPqKz_Gdl zK?yKHTa()n=~+`E27u|W$N3KfkPmkJmn#43%8;Kb0dGF~z;Da^`;9$4I?5wI9 zlp0(LD@P2Ji$sJFN9lI(YgY-too^0y3ZCP%3N4u4zI^W6$DA6dxSP}l$As{!5*02{ z^EqqDRcvO4dgNYyH?=nDnpM)*FRe~;s-@9op!u#Kr|zQ;v%Xn-83z|3JHFfA{n2lo zUa0!m#*%ZY1Cp%$owl8FadsXnn}R&AjlQZ_D09;YHbyvWcsQ5OdfRemy+=D^-9BxU zMg8(puNL~CMT&|j0-Vq4v}})R9Tg=N-Onc=c_KCed4@D_wl!7*8*pmQ!$FeQjhZOJ zXsEhZV&;W=?&Vk$G>@Oa7oB}Q?l!LZPCMjZW< zPTG(Ek>SK_rA@4_M%oa{KZM(XH7jso3GG%g_UwZd<+&A7MsIM)%4qd}ht9(;!DP=3qGdIi*#jc+f>Aa><8l?i*>{;>{hcs^@1~ zJSFEkXYA4#ZAPUd=Lhl1gbXa&gjKH#Blc`Uq`t#xnFcF&+y8+z*v12=eBU`4VCx@=-O{WeNErAM;-lg$C^$$7Kn;M z{^$*X*?zbK0ujUyG777-fq^(HkdaD;VLkgP=qzp{mMdq>>%Z3du@t=1!B+;_zdZn& za-=$|vodP@N4Myyz1PEo=qvlr7O`{v^F#56%*(4ipce3e);&&0jM~+Mm5h z;@G=WvwvpWE-St#jj&r=J^iVWdXB~%c1tJE=1{%|C>m>3oQRF)jR$1$1HdFVeX{Q{FXY9;s%lw@KcT2p z(}wK4ewx&O1B{OhU|t_iUPBHR6yUE^+Oh zESr$TvG?XUlu^kjdmSWuXKzW!Iym+^B#yn0efYgjw|npB`}=(UD6VpI&igf<&&PW1 z1pen&|M~OQMA>Qr1eDFXP-)_%E7N5Y&}x|9+8)@Wf_uTgynvLsM;>KAmFo@-CT+0z z4<&4b%XjOU8rL5eynP<38)<%u?TJ5>mu!PHBgwsV>kgOQd`0e7Hc!`9Q?iuzTLALxTX{b)&ofJ z2@gl$WRY&}2emY~YC!7h|Jc&WxGkN{>w+tm{(Y^WY4`ZMY5(`De}5)hKpx1wdGa)s z*BV*gNIBAGCzGpMY+9;#`3u^K!Bbe~I0f7YfN*p#a#>t-lEN<68JzaSrVR_TR;%iX zT)7fz##W{i%aafTSeY}2B0Va}Y^eKAK^FJUJBO;A5#>23Ns8t-bZ?A;f{$d*0+ltF 
z@fW3GQ3N}JHd}16r5iMff4@rbS!))~*Hu;|N&lZ;{b!w& z@yL&d>6NBMxc2&eStW}Z@fV_rZdL9*JpjwOEs}hBjZLdh36fYx%VR^x!yrv zdd2)svFh~PPaocBPICVK0+7h4_x-P3&6Ym$`WeDbv{{cDR&__k%n1b02VbSMv$w-| zQ{W2P`DqlaUz5PDrbTrP;9fif4z?rv80ta?l4K3G za+YgT(&5ML3VY}LZiGnIv%h!O2`$D(Yw%4roA!rShpaB~&(|R)g+ps@d2PNkHKv7~ z-Zc&k^t_RXC5T_!X^+*;?%w-D{G{PO_Y5c}`pmGUWdIbm5l=Hwt5V zHxt=?9gdHnhBHNfc0mi-O+^6ZXkpo`GI;lewdLh6wf89~Mc4q# z*dR8b?uGQ`YVTn&XFE>Tg{vM+T#cKg-8DRf>I$>E;v0@>a@#}a%FWSyay_hJB6OaW zkfHzkJ^&07FdI2w-dGNB1BpBR44mu!1+>-un6VS4bcjjuhvQ;!S9Raf@;L8F9dm65 z#{}YD-B9QDGqQu50BSs|zZ@njNOAx6rmfJbHrfkS8v$EEehdL9ak~kre z@zDiSu+vOLY~fJ(VJPbgsgDs|(K-{8ur6jKw6g{gZG-|3&)jlts6Sn4}WxyBU~zv*v%)sAg-OPNU& zX*ZJv*4{2DUoB?4_UD(DUUBA?sBAh`6x}7>v$mCeq8;pKqn+S9XEQ7Mj=1)kGXV%< zc_S(t(-X@s2t9pTcx)e{9wAE#`Z=xeV_a|70Up?m%~jC=U=>4xu>geugrwpMh=mae zx8={^HeL*7ws$-Y;QgW`?5zXbe`5y!t-J_$e*vI;ijT^u2|k>EbAP5rA&lSq)MQ9Y zP@Ou0#!g&<&ON+4;Z3$Szct%iQB~t3u z&Rem5Z*X9`o@)D&r?8wd-Jp=nT)cDPbh&EEQN*UtRB*-ZmH9>^Ezth7H-MKt^0|f* zgLHtk0>U|E=U-|#6$Q_Au=KW}qFTX3I?s1xfiWH{z^oU}amu^_D==RXuIo+_cC~N3 zY^nOYe*ELqdNz0PR+EW;9U7PB*2$o{xR)EUfutHST05D+Goi7En@^}WEn&FWmsT)_ zQ`9uNGh@kT_z}~^if(zL@?$!@bf1muEix2U6X|#N1MbM2erlwd%C2)L&aba=@iLkY zOvS0YitBgMFBMNOve-zM&(^d{Xa8cB@YaIa7OnQ0^vO1ykMs@ z?C-e2pz_XCnWk=7dnfR#eu=sE-|{U03dEt`f3#pFy2=>|OpV}}1fo35@b(5!$t%Og z&NH&=gt>tlrqp!|?+e`je2L3U4ML%CXsf7AnlKoC z+iSDKDJ)4?9f{`K(zo>pKtISI&R0cFJdEy3n8WzgXU)`is*k1bI(A=^@^#Q0%Jt1K>A9c^S(PhRAKrGT%<&6*7hbX>bp4&a z9}%+j2zfn!$l;en>_(|LFg^TX*!BR&0cf!K`-w7(JbERvMzyZ}AjLYyG`r&8jv?99 zCfMf+4r1Js2ClCEA1&^92 zUA-pH{GczJ+6TWpu}e+SfNHlGiV9#FtX_w=cT0oOw*kIh^}Ji{0Se1vzV#{qPUUk5 zKm({?X^P=EN3Q2_uX!(ABAc9@w{Jyr;i$rOytjqRC}^s1gWG?_2H|JNLr9A7w2REs zy|wb?6M1~mv$F=y&(6Bcs*Y=Jqt(s`!kN4JPK6i@5?s2asf9V)z zhYclZqOB|=dhv-XRrkzVx)Cg_Eec-XZvDeSF#bgh?v%4idTM|%v%>(*aQoT%_Ps>V zM*#TUs%roK@#Dv60f<^x(hraedx4JbP6VRkYi7XPzqJ4XL~wzpNfm&xU;-H7Upm8C zAggvcuuWhLcu4tDYoUoq+>!+R%7N%HH9O!m4D7?QN z?COl5`BK_9F90s7X%XU(JhvcEd0Y#plTkQ`9Zc-V}{14L+kz)Adifvvfp zK-N;BkYK@x!tF?rs<}%$W%CYdN!eA9Ze2-Yc-<80nYycGCb+FLGX$<2;yCq5zNTTI z&o3At5#+Vg7*p9^8&0tvfmF{Qc9T+lUOR>UX?DWbTewClbT1*95z^_-;Na%35+h8i z+!**+kyJi=%$~w7{hVOzM)PF39~EcuK@TUbw0Q&iEH3PU(-z5Kqm?Nt|uBMVM8W>O>{1ODv0+~| z{T*cBX#T6pjRVkFM!xatCH#B>Mu5m%a6gs!$0t`&>(+LK76-JZktcAx0_O(j8^i!$ z$kf0!&O-4&u5jGXK#X5U43`RGiTa4J1jUY^^)`=Eb^nxhf#nsd4NS?hb{&_Io;Tw3 z^)hwOtD-pYvUo=3y2xtJY z#^_oFH}JfB%Fgy<{PGOvZ$zZ{s5Var&ci^9cE_mh5zggMGzAOXWvQktt)t}jF)Y_b z`wU@DEjkk;XIK_^$(Vg}H2Xj!9VQ;|*%-q3ZC^x4G!NFx1Ld{0vFy{3;shyHks* z+4D;~an%h?&`?G1ye?>G(gakK$b_}Er?H)8Jy$%&!Bq?}Qa7zf4UvK#{UcW-d;p1{ z_~Sm?g_P@MN~DHEfv_js;BQVUb(lM zT0pPmN_-_j+;l4<3@YSOd5z{uS4yhhwx2tx0_S(vb=L-` z33Y3vO*{fSSp1~_{5bE^yG<^(%SBVUHy`@6sb=qfbx`ZC|DNw#WJXfGqKN&2rn4#3 zgIi6N%(ta)`nH?4Qydg%yo{zun$|}`XsUgRo2F_t*N)D49Cxp`C6#Qq(4%tNLmduX zo%j6`lTMb#?Ox4S+>UY4U&(mDJp|KQtnHThEy&rM9GXDC=KtMaa9jmv*PP^Lz1@T` zNO_0iL|+qxAUW6=c#0K(lJVmqYPJ>#(~GHD#czQjB(nxrzo83_>MDnejFRl9glvXI zCD%N``Dms+ipvb5WdloK*2!Fl=#vpZ4El}b_9A4xalmq6a7u`8qufT}VZZl9z{58M?$@YOqNu3x;{@?u%ZYMV1AaJLUtfTbvrf@j=X){(*Ng~t#=q3?@+(iA#BRJYAp_~0rE+#~;FDlKG2wTly3ze? 
zhD_G5VUiddks_LjHijY2mt)*9KA*#CCnvPX$>UnaDH{#4;op!UE=af&Qf$;Z1X32_ zt?l3(n7^~|BlDY5#7f5&-FWSeu4+Qn!>0AA4fiDQpr?X@BT_{(ugpz*1a@L;CZX)2 zzN-#W0^T2Ei>6*K9sA4JWMee~qKTh+Jl(H2i#u|Cl9Yup3f9MJ5SqAT`=(GM{*c_G z%3yy1sxMnTb)AeUyMcoK_Gd^cR0ovIPXHOY>SmqsQUqtNytVV_8OSnvyLv+p@ItgD zrQnieW(X;~LTd^NWu+f+0tihtyTBVMU}2Roj6akuPen-cn;$BLROnlRQ zVEIC=V670Dx4Oo_FiS>94) zC{UXJ?rPpdgHna_I--^=P>EykchC9yW|ZLW4kmH0M}5}omc@hSPvAL)YMd*}1RsC% z%(NajUZC#dPV;}1wErHXt>ykk1c)r3)sEAMr7!Zt$(BUo8o2x}&ZNu`1q zrifW}?ML%SN#9evT=HKpW%6_%!5`7(;J3>*^r$3)8+BQ9GmYea+BPqGvh^t_`i_lG z({kzTweOXW@$nZz zt-hUdiGaql#C?a_ZE^ogX2ldSI8ULY2UdyL?-a@|yZ>^x$#c*KRO}5kA&~6xM{iXU z0Obhl`;3xQ5YZ z2BLvL{!QDF5?mJ4P}Z`W^|Q)A`^GON6u-KQlJ)7Lw~K{%jiWCh;KKxjMwTj9lou0Qykc(pyt=>-aqqTa`Pd6b_MdqiI3L3!3zlq<9}3g~>%)Z@1b%j$ zHF_XL%?`1U0N1 zI?Maqn6bywFgX~DL~fW5wbd34Y!P{X z4 z3H?#!u4`HTC*uFajlJ}kJ{$R9wk)&FS~$7c(3QOYD!l9x?@05=4dQZl4iSASNnIYp z#}1Omfkiwztu@^N3QCy4H{=E&8fYRDNn$-k+(a(vQ#=mZh>^k}?!0l;5sB48qO=}H zKeqv%R8w7#+4;GfYE~W9`?%iBUL;ny&!JWxc5uiqC9~IL-;VR)hxwfTa5Wm#q>+Zr zz6xiPN#F20(C2QP%JWjzsi9IgltP`7!1Hx)<=;b`7y@k9nb$P)bxjqv`gn#=-PG9w zMJgGTF;%$4yJ_2k2C{*lWKo7S{}flWskp0Jbkp}>(-inEj7|IwGk12uR3t@!OFaR} z{&PEFWV4z8xV~b6rwN*Cyf8yh5f|YE0@^iRdPFCefL}2radX2nM${j7?^Kq#>8ohhk6-*a$K4 z7=+18d9HH`iAer!I=@}lguB}&H|a2Y@9eyxV>;-&>N|XAOX?Mgj!P)C7Pr0@mRWb; zj`=(8_A}DkP^oDBoxa>_J<|te#;4Aj9j~k1iz3JqWib6rJ(Dvi{X!3;dzgRSV$!Cg zac-R>xb(VZ6Pm>^bHj>lG*-uKlHYT`;&Q)J;>eJ#A4UqLsC2$7Fz9ioJH!ZF= zESg(a4Zxf_AS05qvIdBZ+DRem@pr%0gaC1A9+*jg1%=Gh&etk7mO|Hq2?;~-_sjr7 z(HmL(J&-I=o`Vw=p|L7ryb%8<`@nZHT_I$-1`Q95Ye$RSq1Ze%2>ckm2kEhIJhbRZ z7SjYQSB3PW_u+ytcdPO8TouBP6ZUng3&#k0-Dl@B)K^YR-Pb2`0VB9vx6$Zw&Ju{? zzNI_Pwm8i{pEo@$E26)?VfHa9s^Z0W3MiObVo+ug)_o|T3oIJuek7H0uSu5sGeWk1 z{h&K}(LJN-k5YzSxYw*m0KyvTC=U~;YZ^>WBO0fU5c`6#;;gFSz7pUNlbu$E@vU+B zg1H2cxyVYJ!YU2Rbuh-lY?ot7ul944<5%64L6-wV-p@pEtb}P6;ciORum&cnNEcvR zlET9RMscaWPRWSg9|FeA*XRyZYwC&Pr-pV%-)efVWA@7(zl*T3{|Z>3!f3?_cgEd} z%M6in*(<=|M-`~GzGOZfOMs&BOvUkcpg>7JQ)NPFz+zjghdS&KSW+ZCh5`cVJHcI` zKm{wE16-rv)*(P_9q~OQ^!D+c1TtKfykYdUP1~t=A3$yTdk2-RO9~PEqkeyG5}*N! z?9sytADflDS8&*^*1+InXt_?R!?D!r*%H~bgxB)FGUoUQTvhufODbR`(4E-5Z+FLn8-@=E{pT zFzq4L@04P-ks-_crqiWem5YV*6DNuMOo%h(J2pAcFn;`o2ry}Nfc;DZ=krFt0v5pb zAe{dP2VsR=j$Q&9l9`MM>4Ap*>7?~uV0MroXiwk|T1VKN+VXM2-bnS)z9EaY#sQ4gL=2TwHMli-Alr`Lcj*deg5^8Jv$H6G|Ex; z7tm%JH2T)KuZ?Fau|o&voQdF4x-GtUl~bjmEQH;uBgS6y^yXa)C{nlJy53nhh$VT6 z*&mlZD_~w{QtUdZ^Vo2&hp@v=O4|X@e=x^?b`lLnC0cs%Osq$tl!+D%3WAcaKcvuxkzCi8wDQf!+wt4LO z^oycua%OJo3BsHw0NtEF3i?8~+7E0c=^+Q`9o;9s#LRNdt+0$D8Z2C!kv$YvIHh^ zP2j$Q^o=kD_=0IITtvsv=F|-__?~JATK0A)P9H1li<3sqHSgfeC&21^N3W#WYy9x# z_r8V$0DV9hJ5)eq)RQ?j03G3ce_L91aQc8|oa`l@+2hJ{k$~rf2zN?}q@u4G8@Ywz zneVu?WKa(B!OHII+HqE0*mdXCJK!owL9zoe;=5mEzC0=-lcdvzKUp^JY>v$~DYIVM z*J{)uPY^IP{)UvLd(3)t=&gFZknN9t7khL-=P8_0gQ{(UFUK`reF^EsOG-NgwZWjz zx;3O(>iGff`oNyekynU!EnCR8JCC1-9v`gO}$N zI6MR1`^ub5*-19BgM&%xPN~2&k;_eW_$pb7POm~<+F&nRD<+}hMzmVIqR=dBt^x-vL8PDr*kbHJ(yt0w%_6^&>k+>itvjg;-x;=2I ziIqwd@1^TfJ1R$N!Tq}hAEvpR;uLEM3fOV+24@{4%&zAiKu^g3Z~d{$PD%{|Ew{$& z&EmS;jn|Lg7>Lw-XKclqBc$D+e(ymn1T%2jxquAvmzasCQEyQInlL1MIrltWxy6oq9QLo{GzQs@qsZ_AWC4y2l!aqc zO(i0*N2h;uX8~}o^Yi=KWmWRk}|1~1G`fTyMa zfGL)1;}zkg+e0T`dp8`axq+pVMe1bp8-IY23-}3CBr&O||ELV8Y`@%Uqrf!(fg(J$hgP-HJ*UR(C+eZ=49xwfx=JZrLml`bh&w)4ORm)zuIrdic_Z z?N2Cp{~$8aG`XpkGCU)awTR5Lsps8Z4}}im)C89`i^WX*3L%Q>yLy^C#OkIVCh9Nu zCi0#b2{lv%+t;2l&G#m78iiP^yTtDZd#%fd!6n=0_=({2i@j2Ra+`~tMH|&ZAj&Gh zw+Xa-Pydp`^}J=r#WU|N%}JhszU#wb+!&C9z3*bU8cm}*BNm-_iU5Q)Okf?X+TRMI z-x5bMEr34-bULvU?42hAi5@Mz#pQWR6%tO0!6~Zf=SZmv5D;eUq?)!Jf^7#L^(%JM zJ{|Y=j=Csve79cl2TaP>berhaq+Y%La&)Un`HqM8-o=h*C*RofoK&AQ)y|)~ZdS9! 
z*Nz$%rx(IlK9KXCfmqIOcWh$2^IXl(Ygd1Y z#6^;=h}~svLmBS8K7yVgXl5*Shtd@pscfDo5IbA!RPQG2Fzimu(j9Fz-@_h^IDudH z(0V(`BbR#fDXvp2ZP(ZMl5AG}mC9CeaWg!2=PdQuVe4>~aKQ)Z;X|`BF-6MixA0!8 zp*APa$VdxhPRWjasdl##LH`8Rx)EFiQdqf^ z=3?6&m;z>W-a@AET;DyJc}-&tX5n?FNO%N0NOKaJo(9VXQ!!_uQ(@Cq^1z-Mp)|=C z|6wDfX>{uZ)E9gE<*jX=FoigGKiD^;^DxwTaml*kht@#*Cb3*tHT@L02EQ3ePxNeT z)IsV<$zCY@Ig($G8gx%34uf>ZL~jrx2E6n_T>+W#3gq7i509nWcQ!IjXCR$-DM!UN z>;DD%u5$#%QrXx%jOgWMwFPmk8TU3Pw~8i)Ha-Y<#&FglA%E=E$!BNN688NrQxdn- zi(seWYalfU3~dC#A-s@#a0_)lSJ>Y1B|% z1aw5-XQN_I3_b0#aaf36@~R_xV4V8b5kPrbLW=Nn(BXg?fg@!~bEy`TC% ziOb)$>T@D8#fRo;7o3ntG<2}&k&$aHYpi>X#y-Bg=WuMIz!!o7Jnm+S0}X%R z<|Bn~wU>bqQzHB2`{%YInxu6;ej+18%=Y&?+&L$CtR3D}N>CtMaI3l5rJ!3dB8r~O z2;bd{+DM@&fd1iQ;uAcCc*)NCT=vo-V?N#YG5vNa`n$7%Zgoo)Vk<_b^V!yM>GoTh zT+t2yqO-tK7aD<|q{oxEjJ&!)NyU zn-EX--*#1?g~fU!s5hSp@(P5lxH8a(c3rq8^7B_a>Xde?a(&;dsFVOYlXlX)_JI7X z>GtflznU3?&u}JvG)Y2Vssnqz`aI^I4n! z5jXFKD_-{8JIC7FOi@B`ayf4QdbB)QBcPfpZ8WF{EF?_)FW)%c zOajBi52^rmIPmL7RIXO;Q&Vg0rbJ7>w^JAS``LxiFNr@k5!6e6JY(4=>%Mq5sJz=+ zUB5;(039uKwE2DkObOpT*+Ebu&wmExRx0r5E^TVTyRF~OSF}UyeiG#`A9rM@`aO=2 zKC=Aj$`Nevb{@l`fMh9C2)i_+PdpJ!A?|KB^#<;{DgD^twQvNB%p*iSma%ZM{La_p z5DM{NPJb0vI3*gPU~+C%m3U`?T98iy+c;K7Sy7*@ICJ4WYL7ij=@=&KLw$07a9$4& zuDw%ljdZt0p?e%v(sSna_`ff`W?;*fbQ<9;ZgM^3hYFisqGFR8+SqfN{#y%B!P>n9 zDVA1!Krwl-U4$Z9U5UOgATmte(hb+|smPK?D2G(^XB6@sKH}Q7wfb^owQM$P6gz9orI$RQ@m;6k~!J*AXLut8#waDI+NGe@n2sZu{vH+MM`mMlWh< znOGa4OScye!g6iN{E%o#aVlflIOZv5C>Z8|$u)@(FU<-!R8LiM@Dr9!{`O>;epx#5nLpYOtXm5b zI47I>B<>Uki&p2aMSEUjXQ3w}_Kx8VM=#t2i~}><7iSOI*74Lkk1l(vJ;uBRzZ+m{ z+;sfu-}n(<_6$E2`^ol&p#L+N{-cDnbu`h#7a&&ghg6p}j-03w(5^<~`S6Z=MH-w- zg$$oht(o;bdQj(iJ07cOPGv}wn$~L;XIX<7=f<-N$L(mkwEbLWSVqy@3CdY=`>VlVfvxHMQA=XZ z_Bt#)$ewF_?rdH$Cda4eY3HG&+xk_i_y&06vzl)<@#NAWNSCUEr$v1Ngb2lt&KBVH z-_`J+N}+1~Sx_Jwc(K}fo`UM~U4J}fy&ulp$6$PRE&$j9P%ay_E4)X``NfM_NkuiO z_j3%%^UP<&?rALu>ji9bH`D4I2+cEP7~%()@1jEjAke~O=kZh|$QF6fCkU*A>AK11 zY~K7iY;6T$?2lN8FBHe<86)0i^H)V{QQW88e7}WYl8$`Gz#A$1PCryv6A{3E{1GxR zKDGBT6iTqTuPBWb=GjE?%JGB{XG zEkiF>M6?eKdUZulg%Qu_&mTb#x9N|p+h}&%AagDE!nCpZ5%as9{H6F=PYTH7V5MPBq4=pNG}c??=)=N>=yQX@4jQ}Qid?zF zFUH5S-Fb`MLCU+^1#W^bf+PwSiC?zN(%X+Bh1a%*l!}iO`qNy7E(oSQx!xJQew5a@ zCb2~$C24WqZWK0If%@U*=jxAU_Yu()-{(CUlG&t@^?iA4pv5skC0Bduy?b$pl^@w5 zG_I?Gg{2==(WlWEYG+q7pB{fj!Q}&L3%w_oM~L0Q?c-$CTwqU?W826gcVtv^RE(E$ ztYR{6^DfK1P^ltxUbE$Aj)ty`NtnwO+-|la2EzxlvQtUfuTzXp;rCx}?grrXXD}Ii z0HoJ1fo}gZm|INZ4aCn{z|iXTX-_&4pD7PGTHj)49 zkUa$7rBQ6$(AuspgB8Rj*F_84jrVb@1q-!qkI$6}I}r#cHbqQj~zpO7N-rg2_|0?_rS$`j2Z1q@g(E8 z^`nD#*Ax$((JE-vrN~BdVkvUb~yVuim}P$@J;EIKcC^G z)G5}=62%o6&(+ki&|s2t*msRa)dLk14Q%$0BFe2Fj$}Wg>p;`f1q%tlS$wTMpDn<;Ck1? z*J)M-|IO%mY0K$X$9eDFex)T4AVl8U6(%zM5cEkIAoE=lj3_L=rT~sq;fp-oV~n2E zWoge?!GiuQ3cjZ1q}fUMg7|6??wEb{!X7-YHw55*{@T>*DK62KBC&_?S{5=VhVJid zoL>^TeTOM7q0Ox}^xgOCC-+EFjcT1^g85Ktd_Ct*&;R(Wz&EW4_a5d_wxSQO+o{~h z9_C!A(?OHGY=L^UEl3Sob-#$>z232+BlFBtsqiSsTixyK9wOIiQKEEbO6wZBA}O}8 zJ|1=R=AG=Ci9LxuDKP}7lUmFhUWtgjoDIv_2d6J)GL^P5vNw*Ih+=jU)lNbQp`w6KUd_kp*52>^cAcIQBS zo4q~Xr5VN~lmpa&uI;QI8i|5d9h7%PeGl%2F^hzQFlq&x;ezwwg8(MHd160aVyYnT zlOR9}BYlEluzx$0uS??_i29N)2L{r&Hsv1u6mBSPJi3hm)k)-oBG~F^>0ptOgp>XO z5Y{MxPy=#+@cMs&Ak@IWmkILPHOnk|Zs`_13-3vm^!lh%peLss%aabmAp61oxqnkF zne12b(@ct(TTkQ{HL}%MM`~BYI_{2xpE}o|!rFDAn{jp*1i4!R5L9UPsY^FKuaFXR zU)TC&UYndcmS3a3vq22^8*(*GT&nsDxi_J5i6+W0-*7R#TG!k+nSjQ+@SO76BB{p? 
z@B$(2s-_fkpJwvg2}hoRf1%uIPVs1O!r{BG-FP|IOj@}6ZLJ(x)g)nh@s*-`#Te6# z)l}Fi*Uz=%jV6XO9z+iSv8~$PgGnyKJm;14qC8<4z`A{roqZ!$E@$OiM2X%8;Lx*y zwt?u~92QJsO7zqT*^=69CVYX_BX`2~DbO&+6`C|PE-LgUpEY!^F)XwmwMYltvmGr7 z&?!9Hc*qIBElgVv$2mz!0GaQle%hC^_|Pklpv7R2^4P zu7*xjInbxpYL5@66coOt&%_eBuFknIFU zqGC-dGMU4jo0A3JUtAzOvRH=Z>r49(WwpEORXcSxCgXs;U5#vNriJIGOmi6plVXE^ zKf8}t30iLrda#+f9#}fjLp@uG-D99zT5(|T++$$8(W5WV*C->e%t->Tc==Q1Pai2w z|9mDP04=-7BH^xb^3eVapMw2o$VFeTcJ}oxLZ(xZ(~hoQ#Cm>s03@~UY5co;U{1r8 z8YXa+KP;uVyP5nVJC?MK;=T$%TEgSYpWWl8eL`ICkJg10XNPOQ<4&<{5U<2+k}pfT zzjeq#a|JqIr>}Wui@L3+Wt?1sjbv%S(It&Bxgxp_DLnO~AJ;&Uzpa1r*CC?=|;D9h(xHhy4k>7~o6 zz2l43Ex^i)*?&$>7@({|E9;(RHlm8soovp0mV~1P=lXr_GdmCTvzvbsJC(84P8pvv z=IIC$8w@4mDQvW4Q19uYe^LtyJC?^<&ideWIz9yjxEQFTKkLd8tDIvJd{|9n(DZ{K z@LSig|GN*X)P1#_8V!LocS#+&lF9K|^F9*a5yv-;xvhP{5s(5{I=n__BTpw5v*q!M z{F|P0!~ucK`~lP;=uq?2*gotVb^1g*i5|-iu`H5cwtk`Y*R~V=HR73#w>RNwUOR() zR`QT$yaU8?6jyZcy$7cJ5AC4$eUZ#;b;TSH8p=AJB2UM#mS$Ud`Z0Yh)VVt6d|+>;_=TO`g_EX z(6I5mK>0lid`z=QRl;^mw8!c5PH^-64g;;UEDyIS4WK;g!eYCDcu+AyYELii9Pgax z;e1?7+(W!r5OH!53W^&8rx)7quwU;OeHU4{xV?}lh^~B&S_{Rw@|KO%+m>8l5YavK4~ZA*qqTsyIq3- zbS#~K`i_f@Y3-Fc7{mOM=&v@4&2zUfDUiAWRj0AnFHFy7Gk{=QB~h~dgn#vzxfO|I zP1G!6A?LgOig25oJuGYE64t{hUhSzE9brd!7yLS&W}R4U{`i5#Xg>LKt6y>{>J9f) zn77$Zmyud^k86k!0nC-_pLNt5vMP#Pj|*N66hCWtjN;XkgHAIRabCf!4}KJ{u$1-W z8-))$Og&05_SGd<3#kR|Re_LD^4mYfY&K}G+V5_@m+_yaNkb!2X#!~)9ch?_RaR&| zNt>zZcdyYT&)VL+xAFosxjf<80-Rwnm2>?HH*FY2NtcUYFU9^_ zdPm)6Dv%Pg#m_3^0;hOcC#O%IY+QWe6Eh$xdIPjBe5%tWNo=Vz*r$=HUQGRsmxTCq zMRRT)VKCzL^C;e4K_oe@IRUR@^%c!k(6W4tu zR1ncLwTi!f=F{}yYTIkHf&FI8Onw2#ZrZ8t=*pXDqx z)n5LrD~{9hw%cUC+XAXbE-?VYN-RbL%JX;}27ANW$ykTOnZkCeHfFu)%x`w|&F5DM z>M7#aIg+_FiyBsMEA$&nY2gDl*5lGlJl!a1$b)*KaOvD;>gu0g`@K3MuNe1`O2dFk zhnRzffdQ@8MoB;}@{&La4RWU5pgL17q}eZ;U$2)ucQ8VAY5xN8dLgIcd$Q{HdD~r) zq%60$9Ps+Jf4urQ`}FSo8&d*4o(7B!)}#A-Feh<#Ad^04J` ztwvLfiFDT#?+1Uv@`>-4j9m!G=SQStK14n^zeNkn%R=~+rdHihCZ*!-B%vil8w0+q*KZ| z1&zy+Fnup-ENBvyF}vi7K-_Q27il@u8h`ImZ3b$15YB(RG!XJi<2(-s2U0*IeMu^iWcB zcJ=I`i%TJq9(tRndpiZsUD2z$Z;w^->C*apgaR?^xAPFc|HRIX4Wk^18rQ`sB#)Ox z@!Kgj-`6<5GTmZq9+x_%LGH5qeJ&M1`-oEjNAS&xhN^H!lS-C^LeINFu4_)$zMaMK znp}#r(&uPOGOM}Gmb%>cip>uBrbyjo?G-dx%Fu60q1aC{R@e?Tf4R+36^8i{-tuTt zx~(g&xbID_%iDFJ{wupHw7wrG^Rh8o=JxQ6*A4p}sv(QgzI<%%OGL_Eqpt*FFr zf^=-6)^xjNX%T4EzZ|)eh~1I)&XptXOqMR>mzY6Zp{(jN+z}2oa=ket$kO{`uXnG&C>ZE5 z;W$3Y^4#3W+9!Kn*(&D98ru<5>+PkfT^Jw?@k8cq&WUm z-=(`JBQ}1S_+=v@M9C|?n$A58k9|$M`Ip;l71reEDuy3td?1zY|yH8U1~mSJB#qse^kW2{=Axp<%cxMN!BUoo}LaD=v$o} zY-v?*R)p5v753-j=b9b$p(d2Tj* zMt!p=%(v><0#?u^))L`0$U*3@QA?fQ2U&Wwo0pFQYGR@olG(s5=}%CUgf~zaC`z#C z4*oBp6@hLfQ!ksTc&9-RD)tpK@i@JRCIx#x2HlsgKp?IU;=B*^+TWh8j+M1MU}m5g z)hvDSy_J{-NG#n}!_P$w=V>d;26^o)P(}9>!}&}adHhwX_s8g7oGT^cU|$)ttIr=*=>;Nhr5W<2jk>bU-Z)2V*mw~S8;~S^6zujYu;F}f zPQDJYguChVO_g|BqMuz@r*VZ+QqH~p8-nfT9B;V1uya=ll0%2*8Sw(~Q?KGvw9ex@ znk&BAvCr2jBsk_7HUzU{a4wslE>oB~iykIGM(C#*CkKGwCnUjCe=kVe%XOM-3)ohB z-lUl<%UqqUlc&XnH}sPQG~4Dg=gL;zSEqO_a&K}+;9$hk~ zeuc&JAsLps&@9n>n)hp;2NAoLXb4A22kkk=evK#G2|`ng8k+fjvGGP`seZ3@BHRPX z(WyPRshr5WaV0D72f24QLLgOogI{BTFF4$o z!7|iPCOK4~B}^8r2oE6|4nU~QvcLOOsgQt~8?6U;W zMT~uiMj?(w@TM295BMe@%@grfIc*JG7vCB|x&A^#?H;FRxMG&IKV0Ya(dkU$CdL1P zh*ZO_ll^r2-exyw*G#K!X&_Vy640~Q){=J6V z2t#)Zh%n?xNrR-MfWXk*`CjAR_w(Ezp7-lM4))$~{IBa;>s-I{Ts76CQbR~;%f?&B zB33#9-86g;4{WD+4})&w+7qo7k@_;NN~2Jf?xM~L|K8lpp!MuQRo1Za6i|YDTMGNC zsvqUz+Jk)*gt^%RTQR^utNQ0jb6m|_to2=l06JbV-kEwEThi#uUT_u-Q6i=J?+a@C zjNyV(Eh4w&VY*c@FWlbWPAsn<)iOz?wHvM$+wMHo7h{jAN z2iv&xtUH=}_uG$b-b*1dI_5fFvyoA^Z|Q3AVtc0GNY^s%s}wPFJ=7 z)fvkkZ0u>t{=6uC9VaRPe}A&x%Z0}+1+l#SLS%}h&V7S3U73qf%3B-a<~hgE+1p9# 
zh5;T$EythB)DKtNj`7YV*?8Pk@C(K#%VkgV)m-VbB?|0d38F1tSims^-1(fYN#>)s za8ggN3lPbyoh}=Q>fRW@b7VEQ;)1S2T1|jv$QDkZrvT}6#Hqs8g=52d5^tF+>Nq=B zVc&>3Zp=bUHH5#)1E5VwPzxC<&IVfX#3KAA!%oP?7W*+FrF7egdJC zRTMqZC)~lmhZVix4kzBGMkzol3PYSBlo0PpO7o`>zJZ7CGC=0zYdGKSZk{&gwZP}z zNL#^*z4ymmOsYNRKp*_t>7u9Lxmo@UOC>Z^Ha&x{oZkPe>|+X7gM>rA75 zB}x>=2=jPcAz}~1ygNTcWaUvPn#8xxwJ5<^S$MI#Cei~WSuiy>BS^h&Wl;xoxlh>?3v@~{ob$`{E>6WW*Z3#c38!~zwedvOV#^yc$ z($ZddQf0OacL#DbYr2gN(IR*Cl0Gh8vj!cSnpfgOm89S4SN&wswROY!*NTy1CSL^{ zW+ZLKD=dQqZqP2_!r~ejC0TiZ-q};WxcZto;47qGwR}07S(Zw&Xx5p~80#h5GTf!t zyRRZj&%q-zb?vElPWo=ob_p}Mh+<&`vC z3gRPv2=OrzjaX3rQoM7gy)rvZxW#gq=n?C7WNPD39_XNbVDBwTS0u4=8Tc(p{qEZo z+Wu6RQylUy*gbReR4^fX(2#uUYWBohG5(W+3G$7N4~_9}h7Q^89*s zeqy&5Wty_wSXadED0OI*(7@0bTYPq+w<_{7u86YS$Y@cCCI={s8jVw8!4*)>E30*q zQ`hrnS)1*8yj;cuP@Ax%dgCFDfU(i2iNMnKc zxR@m4H{j_&P?>{B-l&CA6aZ>*YY3|!%W`^(-9+6(Xo87k-0L@Cv|s3jtjOU?Q&R&k zA=JS_PU~)1pP*K#jDc>R;cJPP0KzQ*)UJ{z-35*n_VNkX%ShN0HL0({Lau#-^JS3u z0CPQx^xiM7?~kcaVe#sCj=9P7CGj3$*$Gx=h;2c9Ix|DK&Qsi{-%DRU&1~8fUv>xEXawV@!GHa@Sh=5r&qIgpl^yk1=yKgeqb(gJo}|Y&=koBafO6Ik3Y#h_Av_Q zLA`}{7UQrmr8gNBC-!8m?p|xXghR0<`|%)8I$-$II`)&g0T8C!GE1XMjwEf!Xs1qE za(yrDzPGBg8rh%-!euF*KAGWaFS+c#DPeLO+QEn}V8p}!`tCu>*Yb(uwX&wx$G5|; zad~Y_*V?ug_TzY(f>jz2*v6UgDK^2N^O9~G$DucT627rv2~CBDXa(*UR%*hc3K$d` zW-W{Q4`HI!iHeu%ea>+S7B{cnD8HKVKihy$R>yvFY2h%8>N8UIS1|Qbd~&fFg3=g^I}7yKs)=yLfUfPJq@L%oF4= z^Ms(gv0A;L7e8_y(Zku6aAiZ{5pW_(h$5~e_Aw4zw?LDP z@8k*ICI%py(_FI@E9_My<0^V}%H6y$31Om?uGI85hXX%`Y^{69o1y}`syU)Z0eb_DrR8sqX9;2L`p-NN`E-?W= z%w+$$a|SUY)hQ#QiaFEbCq%Mnf2cte&*D9r)E3ZHjYQ6kB2pm5!7cXG0FM@89`nuY z)jXB!t~A=6#uKkp9uIT=iA>DL3)E^Cb^IdYFvmas=;rc*>-FFv7e&x!Qy8-ddv2RM zt%RL(XPnLiDi3}MD@^6z*qGC#75x=(JQo%ipzG(L90_~cWc#MOfI9ruES*%TMgfV` zNQPKEr|y%#H0##BjO~uis@S$g8LyZ> z>CrO#ga{j?*3&^(v`@ZN!Z&_a`|cekHjF? z_A^7A<&Y+iiYVnjtd#6NhRP+sL`hk!1;3lb*CeHTnskckuyr6rMFp6sOF{8$UB#Do> zbzLIFJwD*|n5vnTPR-Rd9exwn;gKtNX*=}Z_CXPfxW^TXxuI+2@b;j2j#(J2^yRO%lK$f-G;enbz|AxKN`1;sY4Y2uiZ z$2ZP|IXaOrQ!xjiI^mF~Po|PwuE=hJ{b>~TnO68u^CI&czmafm%Zfy6L=kywBTG8T z^Ez>%x*j^Vsw1BW;aOOz4aP|ur(KQ($wuN`pG(PNk#T!iAJ+iAf_hE^As*EQQ_Y~l z_`S?DNb20Tumzg6-OCl6trZla5IHnW1{@Q28Gb-(}aE3TW#h($;^)T|%ui`Rw`;ble|yWy?My7}Ea>0( z@R~cgs$;slNqlGZMf-Xwwz;s9%}RSsneN0IA!c#cC&s4bKqiHU)Y678Z#v!WH_}b1 zMOA1qyLK0y(r{dS+%qVwmF!v@&oo_$DFbf(mMq$VwsaUTrQ{T-_6Ni6{nVV> z#X%NV=W9X@_pNYvjH`ZDMwF9CPPWMvPn(orO`GQ8fCT`t_C0YQ26O79p^h;PwD7%^ zrv8CaKm3(2W|4tNTp4e10w2|&IJw0szr!gn@3V-v`v=uCeT$|W)I5nX_qRs4KILM0 z&EJy;$KvqR7P8-_oY0#1(7CT2})?JJDnFZ}M(Bp`MVL&KJT|7?IM6TJ|hO5Iw2Yx-dSpvCQR zit5S{WdH^_AE^Bu6DowOnr`tV{`B5D?)|&e+v_8I(;26@sTOwrriEsKOa-PN1@iPH z5NoSZzGz<4+>s{coa-{1>0ADn)7e&;3nl)_Oy1dsPYX5n?{;yTA_xL1Yj@~Z4i-f& zVUpftZM5C=cOo}yT&XR=CBgCclJLj095RJYpRF_?(Xk-&%2{S^zaEH(RxAGMGA|Q! 
z855q25YN{Rl&?O1EdqOvY$@Em`<`<(R*su}R*@E~{1mhm6omNV)hJ?I&qL?6xJe!8 z6O2WVOP0MwIF~$VEHf58My77(G5W}d(?f+${XX=(gg+m0U&jda1w^C=NtAWD0_qF6 zLg4W)JJ~xmPNmN)?>vL1|0JOkP;c-)z)}$n%&v$i(V`c>&u&V=1cv*^{Ys8k=f{6y zq)dDozM|0i0boHV;p|g(tDzEUuNf8i(-i+V9~EgmbyJ?_q6yO1Q>A<~QzgA{UXq$u zg5NHV*Q6Fjew$D->-x$0;RLJY>|lRto`lR#EXi)$;qHxTYu{bzO`Owi+v%@TNJhTd z@zTmZLv18Z3!Nk{Z=Z{e)2jyn%Ac)$jip|jV!7K`fEDb<7w~1`16t}P9Iei{zQX`V zTNBJzyhugpfveW#104yW3FJ$0QIgSE>M<-^hc`k$If z^(xQHv%sXOTyRWM6I0h3{4IW6y523{yLk<$l@fF$lI$gyQ{v!leAEP{+=^EWPGkmw zR+u*~%kNE1Krt$S$P)^p=__;4CTzPy@6U=fXShGdQM7P)cb3ZU&*q2TLjG#MslG;4 z_LIoBeg0gV^M<0|nqGS0<6ZTM;ev-VFTUtLU_OmCO>Rh~Id{IlaQd($-G0Y+|6FRZ z!1xj@O4nKbpAEg`3!cLd#P^q7 z09}=)udi>q0i@8PJu(9<3w;|9{KtN4rokGl4-J&u0&8s{St_x`X`H4HL9)*1$I_O< z-v`eNV-U==T42ueO!VDtv9@AAzRGmFR%V8e)vw2IcNOwlzQ5(lj=}{15o{rLI(Eka zK0Zh56|`5!tpSsc}p%K^!4V{KCgZ2G+0a6BIoXkWzp zZh$V9q)0qUkx4Ou*Tf1;;^X)&6jEM&Jt7)G7XJBW@4VT~_$g2?z6jaJ(sa7YwK|Xy z{0Gsoo!<9Y(z(tt!eJ=)+Vh$BnNZJEu(5kNHvXA~yrR(p;emm672l(l${tWShqlzH z3V6{B7O2JlP@}IU?N#*~JL(^4R1rdAuY{aSL-hzyFdnBCuV|K?Sn_VtgXTs+5@G%_Pc2u8SF(7CSpz z7Rx@u3{6#@!-|40Zfa)(=iNQOq~R`UvwG)vY13W72Jy3|grxIqeHi_5lMfl_so^{O)!K~Xh zW%A9V=~tp3B7+KF&Kjs$0%PIOTNmIIgd;I#pHjMna3vVJrp_^!h2<*$NhFY#_Lex^enVkWBn2Blr@fHm8JVi}bgeF18* zugyxdqWg8yU&ZG>K6NM5X?wr!{TdpqS7xdLX)Tm}i1ZivrB1s9q*|ea<)6V2oI*{o zOttM2_+u@cI^_egaPP%0g+AwGmpAtO0Cupk7Vw$W-Z>;IX6E`V8cZ*o`vDA2yIp5d zP=I`Xc9g_hw)n1muhjVO6brxpHLzSk zXS=ygf^}m+X}2dzOoP?@dAeB$B)KH3Lk zk;-4N-sYfymh#wbtC-wsv%g7oh&Whbpjn5WvWR#}l}HQl6Nq52qE~PSF@ji zk|Z)w35z-2>Dj>FPSn5toZ$|3|8uG#MIB&DC69{0D6G#oK>LHC$UjksCjx1Mf!`YW zLYL(=Q4P4|2Qmr;wdzK*Cfatzno|Q(>YiM@D$OXjbw9nS%#B^;{{4ohI~jA?jp(ob z)ICQJEqda||MPgFWPb}kPcW8U_3cv zu)F$mhiAqsym7K3hBQT-wU(-o(m5Nw)#lTs)h})t%uwfK-SR#(c_nIVLTucj7@B$%|LV*+Ia~sRn`}#E-(B@35dRsF^~e} zjT3_Y4}$K9mZStl{)sf5WWJ z0>6QoI4a$!=~;KOtiH1;quq0-bR5|OK3186NVneok=ooyYyu2mwGZ*l&1AmZEv}Wf{R#t&MlgwG5{d5PuJ}rc7l>V|s z-|_hm;3Ov?jk14=o3jZHe_*f&2*4t|uMbw?s?;ui78yhdinByLAgpY3TJuM3pf;QXegF+U{BYQ9vhL#bSBtZiJblD~7UHHy7{Pnv8sU>UDM! 
zOw8q2t_p%MjsFa0`&YyG`e#KtSXw$VYSfK-yIj(SQG$o}B?bS;a24=Q@W1cnFX}Xi zP;yOu?Doej?`qEflSY;GE_e12+OaBQZ@Q&Krgl-DvB~~q!T-}6i?Z@1WthF|Xwu|) za$L-CW%lob=0yaC1kMVk4p+tPisx36KXuRYMY`aL*SL4{4SyR$EwA4{;vC2H@fDvr zH}Cyz>oVSc`K!?if<4|9FNhhK6p;Y zGwr5d_bi{TVQOOI0J}@t`ILJ@YDZ*4+C6FX(m(Fg#wjaAr0A4_6H^Jr5Hrti^W+KA z^8VcWU3}>GGuW6JyFWugGy!0R(0K4*^f-N(qE-)lU2B20POkFQec+njZwO&?6|dXs za2KbWS*FKo3}Ng1gS+r)MsZhymp`56ow0*7H14oaPJ2Ork%SF`ZQkVRsxlnlQ#rlU z;>9T60G5fjr&0?D_J%g`$SKu1)!S5N@c1 zD~K~7OmOMoKtlT+?M;Dm!w&$pLV!&e>%KYVssZiao$v_0r81hIuw=TCSZr3rGj0uh zubb(>mZde<97^e`IOC(j`$4v8b|MlfE9KAY|MV?5qdoUc)|`71;}jW0tOjJ)uY*ta zb{tUo%4|E5h8@bDP$y>;U-KCUhSL_p-##n$&ZC*UjNt-FX_ngF$K%QkuJJNQLvKi_ z)<(&ucl}S7s%Et24Q_(={vbsOR)XwpvOaktshm_w*|${!s_L1WPM|8`pz_R9)3b5c z6IMF7YAO%LKK^HR)|0(VTKzc)^Ht)i+$9SIdh82~ie$my+1^`JRh3GrK~hwgMT6Qr0sSaGncf2T zZ`~gmEopf1VD&dJCO<23{{mIJZ?VU;D)g${sVPO)SX<~HDu6j^E%H&|6YS3)N`Lch z_w4;)RB3a8A>_q%5 zUrvMd4FxgcTkp32wz`bQ&Q;0Q4@0+NT~6xpDmVx?1Pg$!J@7D0EhXniJD-ZS=#1ws z663UeR06(KhAOQV#vyGMHZjTW?buwL)bV=b%$5s^Mq1gW;pY;?;gX$TXCA2OuQ5v_ zOam_5SP8E6eSu`r@EB6PzsC=U(R~f$ItB^fgrr5fAEAwa0U82XbN66FJfda^&wqP+*u6q1m+BhSpvYe zkK1L7myr>CUp7-c=LvVDi9e1tqG<)b@(Muo36J&Izt3T!k+MN}?46EOV>ry%6Ow0c zjMxD&!7C&N3*QkSbH+e)ax!II>vpy!Ev`2vb#55sc%ftf3LBp!;=pwL+fgE{lKaK4 zK^u_XqFrroqUSfaRPQ+V(0iwi)*84*CS0Id#G=s-li#9dxufDJ0%0cXf{ame%p)|aYQZ0{(+p^Z^Pms@wgU_it+ zaIv~XV)q})G*4tPij2<3RAfxWrOxZc{D9DTs!kAq6JULS9>rX^r{H22J4J{4bP(NP z$JF0qy);L{qf|g$C*cW`QI9o@<9i4&7>V)JCg0FEJmX4by#@|<{Q4&>myzC!94!am z-!K2;tEas&LR--D0q-NLNt4;^XY>#5%-vU=k77peJuTs)$uSmr$r85su!&Jdr9|;u zwk|s5fh|hWUi7O7pWdVK1TMWRw&F_kt73OI$3vS3IpUH{+D%vo`6h1$b5-MIv1He> z^sX@e>p8ZlQMBq>Y#@y11L1i__Kcn38v zVA(|u7g`<#?p@iRtswKkF?GF`tL6BEw*w0UN)XkB&*TZ$Ek53&Gk$Y=d*xj|7J>BbABompu?N?AXASUTHZz@TgBf2SAOY=oKKYBjFgh}HI{K7H4|+f62)YUG=o*5KYy1TuAdbutsfXlv$*5<@fQh9at`^i4|XbsG``Z|e1i~@g5 zKA_?4Yho5x%DWPirxVNurgWNINwG>cb6sAO#%F0NiI0O7)vkHa)->MGp3MQ0VRvp0 zzo;{lz~3^Z(+Kkh-1wUhe|+5?hzNw4H$=qx6Zz$*P+grE4d!f&mw^q~VZ+`Tmd9fVoN4w-@3l6j6bDQm8RpS%ay}aQ!-Lx^=^o@J~%VG z<=ymHr|T2HiF(5%X3Gp zfNNHt3bjG-%D1w|xE0gn+E@ddA5FrRziSY$q!A|w*=obiF5fC>Bh^dO$uzbEhja&9 z59&z;9{cjLQ?3m+@%J(P9oxvN@V#1 ztD+{jg`<85zi1^`9WIDV7WY`#w5)Di0K3dbNRlc-^!l^Yq z2QuP7<&u7Dx$)7KkUO%T*-8yJ6|#f@r!?c{-jZarN4~%*b4l& zh6{DLCMql=Zh&|K9o#3R1mr6q0(m$`nRc%Abq-Kf-T=Ssi~4%YY3=v9ou`9PylHW3 z!x>k<8^jChvhFU`#s-VXatYBaPOr2tYjz%Q6imy+uAxumeDtqCOcXh-^=uHKF$D|z zNhU;;x!QW@4Y}eI$475ybPBZz6NYb%Rr;N|J3TOXUp&(9v(=D*v06Yy(4Dbi0xvt# zUn|pg#Rf{OCj&2OCIy>Ph$Nm5?ps{&#uoYVUT(LLfn?SOAddlCL8V|HMRx{WC^X5! 
z|2{KNwQ>P1D<)2~CL85m@@y=;?-H}(+xzka9g6)eeu4ObE4y*yRI~L(Vzj)yS#`aM zS-gDdP>t(aLU+;3#-7Wo)n%TvUQ=4!{MERzN1E9E%UQm7&23Cqp_4f6`^6((qftvR zx{>o_N#d2(mo*aufiI>{vy7yNch)|sZNdu9x!KKpv*_syD;26_R*QwJAIS6YjcuR} zH1#O^^2$JUb!zCOs>?D(1aW5?$dbomhw0FBiP&ZRFlFl51mT(R8znJ}sTQvDUcbO# z|NT1VVR;h>8zCbNUk1+L>z4!I^__XiJo{7(0yKF|zuW(}GViPfl+M`6NT&dc$Cv{( zurnv2JkWiOOY;+nHKXNp&6TL|iEHTtSpyJ@q_XZSLf!dUEWZzvH}kI>oc%;d;bupq z>xF+HES32xBTN~q(Z7awIx~DDLa%i2pKey<)Il~YJ6ZleVM|fG_tb?~XjnTLJ>ST$ z(r)}Agw!FLR|Yq5A_cdLuP-1l^&%)9!Bg`tE*pXnoXgEmcKp7#I#T3T$*v?V8uAU} z{?@&xz5@1lEsw*}asX+pUL-x8u$&U|zWXfjPJ`4If#bs-sFg>8EIM7F6&R*tW;g@ZYeoI5!%Br{vhnhpZfmv8 ztNX0OlDn4p`DA_jHIcWarA>*6hH+D4K);F@FY&{(<1wgsWK5Nj{AD!NYl*HLAE>>W zHS#S#e_cjvy@2*F3-U&x7KwB#mUL%=UHg2-=@{K~i*%zLy&zKvH%%c(|1!HdO!SRW z0r;cf%t`=zg?J0WfmGAvU`9{+9ebAQn#y&LMGqq#gal|_CXB0 ze0bC~5@5aiW_tfP#B$=#=bdFKDl_h5fp6V}u#a}Et&eabz168uT0poOt5o_DC*oslMiG9+or z7!@nAO=5>IU%<4}-N4uzPA&29#6`q!6)%#H^5sM$- z&k*gm>PEqG5F=C>ka32HgzJAvo1-8*Y4KAmY3eK-I${Gkn7B*2c5XP)(whwVSXNmT4~oM zS@wR=5#x%=cx(pz7UU5nhudNd2u)%e&}0z@RmfvjRgHuOX+}|1*ab?ChX1V~9z;;q zrdZ93%R!HYKPC3k|ubyzN;E5 zn3Wp|rLyP0C!<``5PVODI1%1a*wDK2?0tq;O1HoQf0xs7Gcg|ldKujBKhL?d-eJeW zZk09kthmV7b*%NbE*e$we!S_a3f`qvq`VvSlU`&8EcfCzyN|)pR=jha_#*2C_eu_K zfPE==n3j^92z6PEiQ;%nL!^CI2)okQ^D4T2lQCw9$k40p>AfbmfSUVJU~B0nw`VTUs}*i&p*We&}Typ!G3~39GOiDuG@nb5mIP=RtuM?zMou?=l5H zzlcpl|Fl27X5+-}R;-E8G@&K2-8%lmS9_MVb;DDaAW;*IUtXUMv#1r;+wRUgQ$3d_ zi!pdaf5?1PcjvJf`+KQ!I4tk*XEi_Ycd@fy+!#Do|7B;AgEM>0~mgM9RjwN!{IpC{Fq)?A5&B{kFDAT>XfF8(^8C#{t zF4TB%@=0_+L*RSKZswH`8YzBznH)E1rQzF zvZ=fv*mBsm=c?v0?x9y{8nic@i}aY7cMIXL^E%&^X1}9p_RHSQ#LZC}UO_oJWqchH zw!7QDbIrKkAauNq>D|lkaZQ(}wpi*0=pb@BiU+uYI_TG$(0+K+I;whTX63;cyqy`C zvaq&o+1NYuVI_4{Ia;RDKF=UhDW@r?)d26e0kOAO>|R64grrQ8_jn`qzglUe)16d6 zM=WoP(<n8;XK`3>q;eNL%mu&>mb$?byxM%5n!w(BbX7V+ zVXZP!NfI0vo*nAX=U;D?$xzmqjxw7l+02R|({c(U%%2_JjATB$#vAPg^bBl``Tg_U zx>3Xn%{<>1lFFwhg2t*Ct&D-LSay)}ezt=LNVB4v_LNspNwAgoOlaheQvmp%fS zgmR9aORYp9THsx3DY!e<1|F$m*?G*o0^nK%f$FAgk9o0WHTRX*=6+> zgIy^8e~LS8OmX+8*eOk`77G-{@;`i(X|s=vbaQIx2PL|^x61^F?V1G)YuCQy?K_%Y z)__OJ&EidZ{T!A&7j$1?wwX4LqS8)O+N$Y_Vz*Ks6QNMiRlzFd3=1~**5cg-(# z!QsCRns}-K1{9i-U6q_61uBmHW)0JpMExcU$rdEgX}Fh%C3X(MI+V`u`crKQ5By|Q zWRzCz@S7K&f=OI>-%BX^)2n=HMmL*<(o3?CcE8STO#uufGWS~>@_7d0l|p7GftWY@SOvso#=W3%`HO?wqx72)qG7c)R^KBuK7MQt@mn5ZoW zlGpahF`)=?VlJ=xdCeMq4Qg_lcG~H|dG1U4=K+(j!SqO;L>BMm-0!wceGeRcC(Q1t z@g@iAKjS`W_u_(O<>86=`;2uVtJOj-d0tK{2QYCTZuIk+yJN?;&IosN<&6||Ur(%A zZ}Hc72AlYK{7&qJI&}c3AXh7J^G>Q_e(oD6QAdD?8VdAN zz^2qMP{L?N<8$5)xxiNpiq4AZOZo*k=boj8N``uL%;sG?5g;^4a2;`BsinRK%n35e zi`~lLEcOK2G-*hYv6O8Zy1aXuvio+2d&Q&IQLG9G3obAJzDMLnn1Q*`UT>Bq7>UPW z01b}0uFFXEkFLoAfoe~5N*gtuRm9B+5Ri)}r zXxF0;kJEmg@D}P-z%}uWR|ht`aU16s4%*`jY!5yvdH4vmqg!g|C01-!EzWd75V!{DWq6$DArRk zm2eU?bPD2Y|9$yxUzq==R3c9WDa=-SNQJ;lg2r=_G;C+A?YGL)bDLIdB6()53^Q-P zOqpxsg0EM|aaLAufa*Q*Jb{ZHAF(X;24F13zYqK)4%)uPoU`N4T6qE$#O#N}tuiw@ za+=7OhHjnZ4i4BZry#YF_(ng^N-;AvNggpJ)+6lK5e+>`eD&K{axOlR`Lcc~iHc09 zXgKx8S#+l*rRg3LnDU@|P~UG9r&k~*R}1JL#t{|{KaQQCZxNrk4KiJ|^j5ANJMS<5 z#3Wz+Vmef>X~Lqa9?hog;ll36jAU9Lf-t@KgXCf}9b9L`pLuGlJ&@24v0l4n4k%5f znun7|3??@iw&^Fz+HKuvNdq~?ySo5FJOf%*|qT>-)TB?!*W1i=X2Jp4Oe`4?K&gjTd?ETb`6 zL7yDm6rN3>GeEI1&+rVuFhQBBj=~dA_*R^DzgNc^o+Pu3zcHd8R2Im+7`g!d?qWQ-=vSPibkyVl%LZ6 zb3%8fZ@@lXoMZqBbN+S$P+@D|V5Q!2Vw!cP;hIBULt6q^HmED9SXie;e39~xfhNq) zk?DVBc}yuufte?<_1uoZ@cl0|Fj^v*CoyAzq_lhf#^0{;-!Ut0hfRQRea}ny^$21` zSPzNhT0Rw060lvWMx~#@k7>REcR^mH*Mi0ute^Rqd_7F2zzc)GR=iZGOZ19<*nGs2 z&coKk2xXvOY%fl(u8uLy9j?w~=krYWP>#hRnXUnki-V z{WB)f-XlE9D^%rIn22B4#9u+u%nid=+XVb5iC!a$Ulsq#jy`U3J}^tFe=Z_rzx?Uq zKgJ1VJamF3Fqu&&8HZyUR}i)EDlZkEDTk{+mVu< 
za*U;h5To^3hw^+Y`U3F}fXzwFQRd5gpFvOKRhEnvgp<0oE7sHG^i z9RqOwqUPa83*AD+(xOPaKQ7}s6KfQdu>j2Bj(kx54H7j^C!0z(OvdiHZghw|tPPR% zD3K^3F56uuo+-gr2h{iP!Uy%MM}}pphA)`gq;X+hV$xr}IYmD=8MkOrRu&DB0ShGv zd$hI@VN%ewp_%9*msDYEz@@c*%H%0-@DG?G~2Kb^554 zLBjQ>fjr0O#_zmM(xvj8fcHbCr{O#@C#lLn^MFH?@%+cz#jq1Ei1=r436QLQQUlhD6yavCeQ@jz)Y$v}DT$Rdz(6=oAXXBS z9uh8$=^CA8>7X$(t8=&*=dsOT{}?F^e&mTY z_-Ba0S8+dDh#ncs>WZ6wMoh$K?eLVfF9xvnKv_2sL|d|vyl3zvSalQM7r zq>Ov>N6L)^M&E%7PGJtCj-PYWHrzQN*DfYm`FycR&w8R=YS?E&i6;-OSoB+=e}oI| z`8_xXKob6$si0OVFM!lZ{8*~r%g@q-%5uWRAgzXk=0uQhju)g{5{H?LUkQN+Cw&i_ z;FnH(_ZZmqS|g7Au!;hHLsSIa@j)Jf1^)&%<)oN9Z)=Ln<^iUmXJ^-Lnzp;-ouxVP zRW$7q6w4&w;y}B>+s%6uiN;cm1eXIoWEPLNm(vpo4gikHegABb1TxSS&lLu_=vyyxs{f@;#1_3p`tc~3HIQ&@o1lz(>f^$8hASi?- zv7dON;?x^t{mk8ldEBU$#@;5mC`rV#xp^Qy9}XnY2ji=V-Cvx=fizfAF2oQgff zzMoKUZi7%zjRytJSC16dlYza#)s4Q2bJ#>i^*+Pu_r^CzGTdRt%T2Gx1%l|y5cLeK zIxHESG@E-_zcN z-uNm}h1HB(zZQ32I`RH56W9k@yXFtbmz^I&8f<_Z^$?$K$t}1M%)=jp<*V1Xky6ha zEYjMLaF={Q)6RqZlQkix5z8AL#21h@dFHYZ(bgo$G8_A}D9$qwC3V-Z-qR_ah`rkG zCA`w(PkR(0JT3kUJ{aJt!qJZ1A-Z6DEe&(>rVQL3EJ9|`jw%4sohwL4p?u*Fhybi- zT5TB#*fpDm9!c0`yY6B7Rzbfi=6~3l?jZFp#NBGFt<6O8yH}l_(opdhJ=83#nym&4S`#!eeyer2(a%F5E5_+C|M1qi4+zU! z-s_hE+XPOZy7CzU`zf1PT8`+Rqx3wT#>`PUzIoYqgAg>Cq|BUgY#Lp9;8w4#%wCgq>w|f^A9F?LSO5zxefM6Div@uuch-9`+U^&zC0VJIM+W(D&FBF4HfgL^mr!h|m7v$W%d80PM zq3rmf*Rf$jiq6Y*`f~)jPlg}$ zt|mpH9NqnA4>Loq%;LW>NJtvR_;e{aL85TFE0|T4rW+~x9X}=E#y!7 zbd=SPjt8~xokd0WZ63O-UoDR`4Gd}N)Kh^x5ACIY3=YrB5T@CW}6H0%+|6Ylu|@X+#Lu15rkKdfzrf?;KD&|Z4~{>o6> z8T7-mipv0&8%_ zD`srvn_7pyt<3?0oIy%8NjZ3=O04zN+GxPZuH~HSJ24HULG4KJ0TgWsHwOmn5Mq(- zw8^^1HRM>lgEWtFC(YLBAc!w{g&*l%UoM7Sf?}!qI1O+D1u?2{X++0QJ>!}5Scv5d z6O_rY5TMtdiZ1yBB&^wQ3J}RBAd;gklsuMQP4@x*F~b_c*1KDuH)ZYx^UGmJb_z8@ z99wqiXy4Va#a5LT{wS(ve05M;{9Wy$*5w+{&Q$2tGfI>S!t!X^BMmJPpf6v8f~;xb z-YA@Cmk#&*UG2aOT5hG6J6NedNmC_2_iH&!ddRgdRU-tw%lE_-RY@J;zaQ*gV}*r| z8A^TSJ$mXz6O{|2(;*45;5&F{+hjM32qwONs4(rp`~f3VhVEgvT}L(f@Sxg5VI zk)O4rYMhI7TPzhiI}i;Ea`Z2Yn|xqU^!9+eAd3vdp<&>5a_}gFa$Xrz&f7M6eZ^Gz z&X`L7&0n4|+=vD^z?j3Af&BGO24`N8A?4+?e_`wx51IYiqhz2y5?2-Ga{%wepeimn z!bLLGm;ujE%oz2jjN+8ME(;Q4Kfkpu-Ce9-Yp&WkILtDVjf0+f{G5x6fY^f4Sj}6 zkQIZBq0Sn}bFsLczzBvxv$VTb{UCM1xCp-X99Q!j0Mrl_=mf!1YN@xqB&hQNNu1tN0Dvp{-ge~ZIfi+#7sVG@d3fbACl#!VoN!go>PT3rL7s{5h zvP;Mw$KEL;>yVI5Rv~-O|9$lQzTc;w=lA_pQi@Fgjmx+`$kmduMsp1m)^8=c&a!G(I_4O! zds8M0QQaLkGJCNUiaXRfUQ%P)dJg@@rqFl70pzlT>C|7YPo1PFa_TLr`mwV_*2v>? z7VlJ(r9gcpli&fM+7Tsvhzif84uTDYIQqtwCnaOEBeh;$nJVdosdi*|#2s zC@^%U>C{i*e?^S9WXNNUV#(3J$~H*nE9l5iH%=;jF{;4WNwY>o8{NSK5*1g@+hj-+ zwA?}I)3R_eeTOd2CqY`-SDLLF({{tuhD+2qzuS7CB>1;TEQ`QrVmBs@Tfh zi+bBvwvBWG&pVBGAJc$BTSDfXahtHE@X;fSGqJL*M=18Gqp?vZ5~su@m`#P}_8#-H z?#`rJo=Dt(cdq6tf$%dWgZrMb6gBs;`vHYiIMHvN=5(>uu|@XctTarE7-euHl*=>UT!xff0t^EGg*Wk`yKYRL_uUd;=7H6@&@6% zF=6p%Hi$Kx&ysqkRy)S+Z7_e>`;*P#Rcwf z)N|NN{8;Af=fa8Bqe1*0N-XcH_?<(B7iU1M{2bOeWRVR_xSiPeSZszLfSrUNtuk-B zAwqrUy-iJ%r@!wyt_{BVEoO|kR<_6d=)>+Ox!Anp`*Dz<2d-PcIaVhgY+Ub)Z`3|G zZm0VID~1!?E?p8ZxJ>w0D;FvhbORp;-PvPgRwVY(l=K#Tjeo81!-m_&{A-xLd;;si zN0w_a-6`9cAFK6s9?&1@gq6OIBX?t0D-Si#4E9C2ThULmtG=V19=JBWpX-oiU^O+R zKzVzEHeqM~J{yQzABBuA+TKrSwc~Joyl=Ow?gm`LQ?BRgQl-kov5YH! 
zmJAkgqr@pI5SH)IZ*J{#q{J#=3ck)ZJHk|!$%ML9Jf3@7xHD86w2;qNGYMj$qx3&* zb2O}edwlHV$MR~x=aLcF!cJqP?DPDW-L&7Cw_9&xl-nz#jH;=Qp<%S0i?^(rgZb7m zi4gAyxoCPh*;s$F4CS}coyXs@Q+qxiiKA81CMG7n=^9P=IcP03*;~%~1OYMLxagcS zB1TY<7kT-W-PP5fGt1?DCeg*C#pu2IS5DksV%S4s_|Ls-NeulEUabO~3OI~o?6@q# zD$abP_TSqj703JI)<&~AdurBe@+0pBkTNMB6b|@@)T2mH^)YhZxWQ81^{DW|_nLBb zPWbsPsKx0hp&$|5DpujsOd^p=i~GA&B2kM?JG^~VPa@^(Q_RREH@Nz5TRpE$zYAr{ob~JE{6E3zZzhL5efEgF_k?LWeo>lM+ete#C3)Zz!hZ zGjATcPhQ}9EXvZ8pmwBpE|P1vVmOBmiET( zBn2rg_Ma~vlH{c}6nwil&Ve+%bFZnmN+xX5C2Ab9L{?Ze$L`g(Q)nxkETr}+^o^SZ zl(4v?q{&P3r5Q7gx9U=GHK+Ph%E~G5ACfMWEOFfYkwNda(JNV`U#6g8I<@xc*p=+Y zXYZmG^%zh2pPy(G^E|*2qptDwG$#@giLUg!?Sx)58`r|xG4qt^;dq+XU`MO}V^z}{9%d1)jN_SM{>S2V(u5al7C7g%Kgm!hsr=(sH&#E<2;f3_UfhK3uFcP3gvq&k(`{Q z%y^w59bcA{5Ztuqd4mV+hcYrdqE43|owWtxL`8S^D`L{TCtpu6Z{HO~>^uxW+jVe{ zo^GiB5Y6MzcOe~}Ep*adf8N8MGl`~OnF%TX<@(QCY^3NBubUa#mi=#MH;bO8F5UV3 z@m~Gm#DVd({ri>eL4-!D+l%U4VsF188%sw&H>mh7tMJnpZgI;N?0-AsNA8DkN_6f} z#Hms6?n&_AFCDso{~Qay%%=`u*n6NKDD1K#^~w{P6(0^)6bnTKUS;+}gc(lLV~*+X zFE%)KE{w3Aa&@(Mcbzx#+>xd5Ej5zf^}K!vIEb!rDl74`J07tL&*3&$!Ph($;Mk6) zB~mhDhn+QyM1(QSJYgJj!hI`Y=|jKx+Ngt|If3u^lM#7espVtD+*VY)LH^Bh!X6}# z2g8zXX5Koqo95;aJ4hcbSZRFfab0ZvwCM-L_K0-uHERaF6>&xa|7RbUbJ#-_ibvA9 z^}h{#@UorHbJK+D7phmg7;^2qz6$Qtm_+|w9u>2<3Q@SXNHxV2#!2$XG6ea4-rAfV z?ksSvX$vgV-5UNP7cTI6rJro(P%L)RUD$Pps_CF2j7j#e4$R3AF?Kp5+&q4q?1g#j z3p1X#U=?V0aO*#O=&E$Gh9Zvit4Wx@UJvsX!-0?sllVT$okHKKGt&6;Qqy8b!?m8j z8WHc0Ih#huu}`53=oJZGVE*@p7{1h0&zp2|6LXLCYeFBSFU|WCdnv)`T*REtUHq|w zTXiVR)kam7U&QR?e_!5*B!&)%!&X73+nCL}LYEKqtfGfqIDNVMPRkvjA$bK9aXtni zJi=9O_vZ7%$**j-)Booqy1}1Xb;tZ-Z!gT#8f;Hn!CQ8J1{diRx&|40rsV7?^oSy! zA7XK3W)}WdBeVE;|CXSUFXNa3=Ha3XKUM$t!Hyo1A{=fqS&##=4X zgEM7_=x8SlEFQ_{5pu?_>&Io{7)BCTPm zKF?4JI{dAR7)wezMf+ch&-7%8qp*SjMxT@b!_OG7`26? zV}SX=U$442T+Lw!0Lu9=+d+|32bF`)YBwioGxbt{oA1LhfyUTpP5ct${S(dQw7+i} z=7eFvpAbC(+nYecAQR2U!n9Ll>D!VT6wB^|vMVLwovMz32MQC>xD2cNI0o+8?Qy}& zGtZ2FVy@lr#La(OUV6;(He}l={joqJjIg{TZ_&j^rofR{v)GIrxDUs}gOh|^o&cLN z8FZWTXY5Q}qsyJKH=Ym}c(z~)`G)VIrA2dP%v!PKkdwIjGYmz9D?}l$PoVZuQxNs&M>ne&-~K>|K~FX5XSU> z?SK30fZRBRmT3ztFnY6Ok?qcN5%XQ0;9`0GW>^qRSFe%^InIki+sD$Nb?pxk1;X8_ zwpV9CU0BA(h6e`+hiW72CpFvvejm7;tw~w1v=m8T(I;m7NnUt!V;r^ZVKT`w^*&hJEI9hGA~^#ApIn0w~v`2T%AcXcVpb4LR_T+vEJyBso*TXCmF#G(k>nKJH`|Qh2-K{=m1gh>B}>i0HLZfhfVAa8nI`7 z=B48+#grA_kY6&CKheM2e*`VbV(c%N-A)IPe$?oV*bv@3e&DjleP`eEcvk5`Py5HB zwCep9{Yt*PpxLQFhMvJs5LoI;Rk@~SJ8gYAzE@;*@&;c{0IA=~je#dUi<>lJLDSW> z1%fiYmRnj{s;ujDj7}?y_}`lNH={)kch@d(?QNegSrB^KGq_1N{{xwBWQbTKrg$z! z{vzq?&xD3@wxeq;4h2`R7THvZ0Q@8a3RLDW7d3sqB}q&S)OkXnYljZnCLEcj&U5RJ zLAP!lj5g#!bWnjo(MEOvGyOjFezO3fu!tx>{G_t;hLz*Lc~37|@SX?HO&7v{-=(@p zjQ5PZCV2jLBlGvu<}G&QXe=miI-ir6a4nT=KgC4W-&;s6U`t4SLg_P8g<1PD+f`Eb z(s}98WmL{|F*6o_&OPje*dCd?Et7;VH#zo>mn@1sy)t?jJh*x3#(Z86U76X1B5Tbv zue#H(obK4TO!{MIr8q98((Y`O*{9R$vp0w;m>?^f3wsF5@4zKnYxFVe_urL+xYnv$lLgK!oPq1_r*Pn@#6E8 z(#|Dn3EQSpxYHbi{?;>GsMDX-Bfodo8r($br=;K&mf()L5gp!~+ffT=xsNj(z>D83 znZ{N3`sIv6*x6;L+i1S6A`=qak3s2f@fj)Z)?V!!9u$Jea7k=U^V+PE1?h#&tL%lS z>W!Io9Xt=xrxGrOLgXn?OYtJe69jXgacHK|843&P+$`BjdR0&IYWGJ>Ba0IT$b{y= zm1v#6?0DoMKfqDAJ$^xKf9<96u+}hW7CnWGnhw+JD_RB`f0EE2SO5#if(#(mE43P? 
z$h%*JkFhBX|7M$Ok8H}PYn!)zw<*LbU{k16o=`#l@^_!|_m9WWVBjc)c4bHs{giDY zl{(Y)CRo(GtcTtFM~o3SGeyj=hd->P?8Fx=G|AstA3hn@=~PcOf5V-vg@+S+>*IE7 zAQs2y>b2`67Bq*uWKNfaB}^vGQ)iJaleey$4pTZU7ykw4$7( zMOHWw6EDepKG^)hyXUc6(<5Autb0q-CWh=uWMbY8OZuQ8$t*-x9kRc+ZDmiE_*b^= zZ4D=5ex@Gz@F5Ej>wN%i)k(0|Xz54`upTOVEr$lOL4kG_%r1$6v2GA9ddxmbfg@@U zM&KX8#DNs%h_Vh2vHyM)#qA-OK-#~vCHVdGe?OvF_*IxN=WF_Vxq8&|i^Hfhv+8Vq z&bAY=?*qmvR>$y*D6ic+I~{>deQ*7xk@~lM_jHeizz_Uj&X~qyH`P9)YyPV!qq(+PGFG7B^Mj z`R!4(+3|%Rt2V(&l?1CAmIXU5K{p~hQEEId808-xlsC6PY<{yU`>_|Hh?RO`algu3uM2VK1^nIo$QzUZiG;ZFGA9xdNHY z=tfAmpvT25(br1#JP&$}I0lP!klEl=FYPMDW@TM1zMXqL`qH>d``Z2HIEElWQIfVz zD!~vghV+4)FS=P0`0m3cbR)MS>&?LY8AwhfGH6Zpd(&JYOFR=p_Vg%=zX5T?26Eg5 zBvT%Cz47b0_v2Su9q8?iMJ?N0&P75(LLuzT16rk~*KNQv2IBox03g^2b?e`f8%j)a zQyPq{{lm(*g55^xE6>XUN+r&fWi?VI2nhd!5!HpE@&aH6O}5ANnNxaf%wMQ*S`2;u zoES;qwz(h;5WU`|m%48;bePpD0P~wZ2AmCtu{+Uu4fw`1kAc#H-yVCN_oS%BR~`Z? z0iF#@UlNF>e+%!i@SP(dMhyGr=z1juA6T&7I6WTv#KKD?57dB#(%8gX# zadNypzsYnp^-STc)Bc8tJXZ7qEWg;V-Y7{9Tod-N(THG)Xv$PZ zRj*q%%m>?ZAcN-*j@vzGHz6PBLPd9GjtVGZf9>_O44B*?RJQk>S!ppoBp5}in1a^w8+S#n<-5#i7-!h=3`Nq{oX zMBhwlLI@ZV96@aVyhciP z>{p0@ew2!bW~MM&tAJ$U0e8q%**v~8ofZ0OSsp(^`t1szCPg;%SyIyV>)ZmceP>>< z=2bPWSWZZVlg@6)oyf`WcxY{72zoK`Xjg;Qu*C`lbEP20Y`>rYfcl znb8p@22w@*Q`8FlNlA%JgK$RlRBp zTlVFh((dzJWlha2tJ$wa30#mSU_0%5Bk*fzx{ghWvOT=s_tb#s^<49#d%ZmaivaQ! zxUXJ9iRU_X=aMESWGrzMg#7Fw>}mCQtD0Z4$wKBeS`*gp=J_Fz({1f(hvy}F9$GIp zMzMlH$t>FozYewrW|x}kS0Mq9h^~U3RlcNj7~^aRYJ_qxCfm;l<=(JJb19OV13reN zzI>W|WJ^!0i=`+)MUr_OkSzZb>tn#!xu$F+`VTv|0nVQneG-3*<}sjV^a27DFvnAa zkK1lFN2j!n7848ubI0{~Fl4o3NPc8Xh$29*1dUDfwF< zH%GuEOerE7Mos%C=z3Hmp%hw|Ay4$ver0r=r$W}e>gBBBy21}y>9i-C|lXIeAZd~b~qjVAu_W~EVq9aw&fYx+~ZWnP(0&3iPbRcQ8)jE6omLk zgbKSs6|u~wMTn|3Gw>B%#hxMqgiPi6P-;DnF3s6`9%(dIfWL&FxKTnNE`=o~So;Ow{tzu+S-=grl(`8Xb z?BqVuF`^F3N_xnc8y+!Y%$GOzKg-u)49Zg_1Ao)F5!LdaL)m;a7Rtx)ztv{Qdu)!2 zeHU>&w{b~i0v57-mbrnaYVf?X(2uxU6dB<$=6&ywN+O;1^x{$G`$|`wTimmUZR;|M zQy7F58W#RCLic)K#1JJ_cq>*C2~h{BJVv3K@bckZ-9^%2j=K#jEFLB|t6U3r<-NfE zzZZf&U~rrd>HgR+es`G8{!kup!@CwuhkcCX`D_-P+(t-CBHpz`aRvcPjy*Dn-*VvX z9*5JykYrV-XOly@mfOT@_Ow2Vezik9Vg~wws?$YrBwrygS?mtgt7X6rO zQ=YB?-~0iCgj}&STSZ%_S@sSrChUeS)mlr~jC~YL?+puG;yC?1d@jkawyvkvdgA-5 z(E__U{q!khI$wQ<6+<0V)g)U3FtBl54kkb?SfaXkF%V-#phxCFiM&1o z6+&Ye7#RZsc%Z29@F~3*w1idmoF^5z8C-t^T#CRm&AlcoCi0v4Zacy9heIDd$~MZs zyttdfUSDCJZW>5l*KC!O&BhYf>$<;l9fR-WJ-s9IwR@&Qvh#)GUrk>OYM51*sm=vA^E3`DB7L+O9 zK(;I3vhvpDjEfdG_bh}m^Cb=NS6>e*(>wt z{+W?KaD4y-g;uGaLpjcQeU==mMW-@UEzE$T?pq}bJ(M(=>N$bTN{KABnII)_jziNM z;2#$zxI8Q`$dStU))#ygw7g;#b9QZ)C ze1EZv_s-*9QtzH8g}i5HNfL{91W->CYv}R0x2H4+V%B`roM72V?+zqK1Xk%6%IRZp z286=2s8W(h)AFjx|C+<2N$bXc9yqLkl9(dxEdHr^3 zmJBd)Ms4{_$9`NLs2g?_JyO}*naewc7yha*N9P&6bl}**{_YlZ=M{h`BQK*Ppje9R1B~!YM=f+*+(x)HlrU-v)2r)mtre?x;E^qKL zoyvKE;{zIgh{wrHR)FE~F|h)&<$ka9>WCY^M@0M$d6SP>8Upq!I2aoXIjF#F$Ee$5 z2$i|TXI^0L(ijHG-Pd&LtQ_D1QhaL3XWTlA9x#Z%hQFxkq+>^{)NT8M40f%p>i!Lp^@euWb-0bM&R zs_%&9qq#y(oVhp#C2R?CaV#o)G~~bapa76>J0SxhzGjA80$h}4w{EoPEnZY}E#@(# zQVgJiz;f>mE@+!ShQB@mx*P@N!BwY6YrVuUr6K749-I91(s^s?b-5P?=4RL6@{ z_p`&a++Q1zKPPVaD2{;qe4f>w>;Z)WoH5j_}Eij-L%d=ewZIfoaq=mes3d=OR-LjPp%~ zFn5|KOEu#Ojn^Ith<^8?6p=LB9M2K5qBL(mO`QRPosUh_gE_5;(NC*oZktw9$SEku z6Zd|e)OsPmSuX;maR02GC*{1@Er@`4{%yjbkFHB&H%-gpt1;q7F1?el0zchY>8C~P zMjLqE>-Yp&P`kN1Xfn2VPPz%L?8S;s#P=WD{ll$(hr+y;7`yda@R@q#x?RJSx5(2~ z397q`lLs!OPw7V6n|2Zo_qUamQkDpY$`A1lcQ50fyc6H;(>5dvA$P!8O^q8f!W?AB+ zrGw6?P0ru+IiZDLGg~gz94P|%Ysc#^GiS!Nd?j6Exs=n@Q4yV-UfhLclL{Y=!>iV> zg^OUM;!UfQlW8j@i+m9zc`&5T7iPIX3?=Wd8!y{44!rfTg-Icbg|uQMt)u)Sr_npP 
[GIT binary patch literal data omitted: base85-encoded binary content for the added image files (see doc/image/*.png in the diffstat); not human-readable and reproduced here only as a placeholder.]
zvd)%jJZuNqowfR)i~q{RQ%=9>x<{{py65a1W8;X#DOAej1=8(}*VBl;b9p{#cN|!b zptDSfGnD~!B}uTx&hz%AzINqH9@y|I8O;fvjD>gjJ)g%U&5=*`Q#<`Eibg^K{Spfp z5WC1FgLR%+K!u2Lwa1OVP^^|uNe>_&EnGC& zGb#ST{+IK0ffw@Cc7&x)3C|7|7}|^XOtGLRdFR1MFq8@cp>?-$ex0-SsV@AcY7&CI zyRDCn54K|Te@d`@IQi^R1**YX=JwrG-*ZzF^y&+RTMrTC;&S~-L=+VNQ#-fU^E!{q zhD_LVA3HLWI}4AI`PSICO!I!Ji1maUm+% z2U(>nCYMacBMe`1%w0;)3H#`;dVDgmfK!;A2Zx9*DVE`?CO@rAbd(ZV1 za+}netlSGPjmsG9^?%teqP z!PvKSYb*3HkCb-<0X%3Q)iju;=} zqV&Vz=m~)>b^IpU8+OJjdRksup&~TjR1`AlvJJ-V<6<;30tD)~n$U+e)9oI2bnYj2 zsZgz|Tg;a@FVhvsR1{x|Eq*O37pVfkg92P6(kcqYH9UI*fD@WYauakjs_sZuhT6#|7QX@yryle>rcYp-=_! zwP?NmbG*|yEod!PER~L0nmq)av2UO%pR}&+Fd%L_*i5e4_qa8~1o^pWl7${<)z}VD zwY0j3(kWnmDF~J>UZ#3rb11-iOg)d%-8EeoezPPqzj`aov2?YR>c!rC5J@A4PLt1X z`%8-RR9Y`?W00bC0sl1ev3&(!o$+%e&SXv^`Vt`xls4?XXvR#ZeeoB^gpz&9p+1S6 z1HI;jP$(BNj5cu%SnezW_L#=Y~|ihrXR~uivI(Dxa!Dw9@ou{9fgF=Ac z{A6|DZve{;0IbCxV=&pg4u*5H4^D+oxIDgv1^$_aKjxD1?WbA;LY~(v{XN1AOjEuX zW3i&Kx52N)uS1ORE8Q|wjE8++K59p3UHB$aT;3_H4P4Ma9_Cepll~Sm1G&tjWHX zx)+nGaFC!>?3Qzh^@x#i5zwcB#aGaA3$O!}4Y~#kEwwCDfofUmNJB6$Nh-vp)1{M#8{w z5*CBZw{wZ!_!5eG=GAgb1rJ+xe5_)#n-5d|xJ=75&Ke*sQ1>YI*Wrbxg8llI{R=O? zS8D!A@%rZ76IGl0_OgkBWlePtJ{LTXo_E?%IV4##`j+iqf;XC#kD)yTd_wO3)ld zs}F1Gm1>$AWQQzt4$I%hc#8QEj3AYwU1JZ3a(>*m`3tnvw`;!>9q5r@csnaAbaER0 zOK;S4F6m)?oh&h_>#6z^P3C$uRv>l71%J9?7$&oYZj6;_Z705&t9~<+V^WzgsgSi9 zh>ngPud*V+p%%*L4;VA@p_=nLOG8o3>udwqSUTR%;w0kIOD_G zx2EQ`G4ordcTC%S+mQ$Oh!BaQ;}_ahxJ1)wC9-@iWKXv)F$De8RsK|!z2YDVEGKIv zxU%q?oV^_E>&2q zru-H&cAf~Sy({@m?nq;Q)hs>zdQ4qC{sE1l?fn-$-&I|RYEFJZD{XeOJFvEQQy+ZY zy|c;UBId& z-l53r>J*rUu>%_*Gu;Q5y;Z;%grG4!#vIJx2x?1St(RNrrt5AD7ZBzur$+!2IuFlD zTxuaPxY!Yx6SAJ(_}rcO3?j~2uJok|4z6vDmZ%rM2%^6R?7p|-<(XY;N>o4-hID*Y ztL6rDra1xay}L3swsW0;x)oMs6>F#s_Z=NJYWBQn{FX30SfD1dWq@2@zQa<5GZ`lH znRJ1h(kfML6WPgV3OH-LD%HmvyaBV%QxArwJ@rOcdyP_kvhOY=?1gscdOeCK1)tk5 zE;P)Pqs7m5XTpF_$2@p*l$NRhmjie*Ndb>Ebu((A;62ui<6^}*O~AutlazcU_~>F6 z>)Q6saauSIjY7yFu5l|F3m2iYHxR|icrE?w;cNt?Y3HQ zfyC+~LbextiRG%&PT8y6+?JH<&jclvNMOIqSlhO$tEN%*_Ky|=xnNIq#>FM8(mF5e@OZ1j{Xgrfanjf_r&LC&=`|w5o)dq3!#J->3Y-*8o z;~}gpQQ$bXEayAv zz=25#qIGTT z9eMw~eiRd+YHclL?feH90P8glQ*p{p<5E8s=a%my8Jt zHPOPQGZT#*b=z*Zm@0A_E;BQAw=|Jn$3tJ`*xUY9Z}C31xOER#shCpy!5=Q`F&l z&*u*LCc0biqY`Q!A(#3Ui~D^c54M2m^|MM9D8ZgIx^h7QfeHl$YVigtrOx^nxu^Tq zhB9c7UaKy4^;aPLsK2tM4*#n^EvAL&>^2(z(+O2|Ugy5^tu46?UkkZG@UpA!E|nCM z$<`7@@PrT^CP_8<&Z|PscSCOpke=1n>+3Ah&^xd3wbP8;LUPc}Vt!bGibs~OvTd01 z!*inhP-f;uB4Fq;TQTnKu z+}gEvnT10I$!}0}Zh(C&L8DlUh1<6%u1(OXQ$!qxq@tgh>m&An*yJw9FV&HsG6pb9 z&bSW{q&kl&ELE!5iXQ=)v8@VxU9-*VEtXI$fm0IV&Fpq}7AogN=K;$8G*CDL+86>z zktfT?_Pf*k-KvH=^Y$)&Yk)+mG-)bTwZFc2UCV_A=?IV+n+L5IhA2oWe$!U(C>L_h zBDu%6)rm?AhOsx%XWr|0o;;Fv}=P|jHNw3iFJHGF=Gcc zV?nN~QR>#Bu6^(K90X+8JGR+I-9L!dvt!}QbUf&4;OF^V8uBvB`PK&px97N z=S@Lahc6t^99}t#+`l_aI2q@C6RhHqBY%?HD^dLo#_#21Kf}769<}|-z+%wlr_P33 zo&d?{;~##UU8K~!5`Jy@JT_Q^Ys$}mKlzh}_8(D^qvRBGh(`$A##dS4EIs+2#0 z_e?&JeONHE-Y&N8%p`K5w`0c6#t@?U(<;N_>@}F57+ZV%IIuX2chTw;4=Klr{;3Fc z9?6|A;gJPj?=;4~e!<)mp`q^#czDKp5^NSH-!%l7TZ2uxkzx0R8kWaTw#J@Xk3adH z!mG@Aci%^e=}RQAzZjNV{ez{=wbfYbHEV-gtNB zG!Ys3cf$(OGuB}T_(+hs*j6!}V2keie4LjJ>qB=oJ(0-wB-grg+@HyD`Q zsupqfDXFN~>uL#Tr8=08ZJ#q1WKto+OjI9r{RTDz{8aZdU79A`-(+aKG*uqWvS4&K z%t{I&at{jH{1g-vrlG9)flpLjlW!+%qWm>7Ij>D|J)iO%(}oJ8Lo?i0;Mgl5H2O-Q zwf0s==)3m@ z405f81sxiDXU?;aTjf(Z&)JnXCj-y6bESKjN)5IlQ<~o|@IHB;e z4qO}RrVsfjnknoXx{u+2kE2)0*Vrc_^5^AX5dthhFFM$8QlJBBR#`D(MH08gk3w7= z0NrN5PpXSrK7L#$%*06Rd_~%xuAMdZad!>;k{2p|levdsDJV2ldcM-n&%hUdWzp(6n^(dR*TVx2p;R+O?~{S%{&pG0t#DqzT%9@(?~gW0VwOrTZ>Fp@r@;R=PR$Zey}o zTBXmSbS%PGuwM4MiZUAnJJl!ehhMv_eaBxLuI^d+m8h_vhK<0mRtb!d>G`x*+|(I; 
zM=ay#m#Kx_YsIorj)3?Z*VE@|>0G z-6_1Bpo%ZR+*WG>2E?*YxK{ES`xVZ>5>d?h4_j}T%kpV@#=ycx`SNYByiNa-F~|aG z({c580bAu@*apP;(QsPe3sn61FUf8MkLV1#t5QmJO0v+6Ly=&>F{I5O^JHQ(&Sg)saJ@1!%RYvXQ(2p`InnllBcCIYX#Ek=4Hlw=0Z-g|;ioPc zdgQ5~@_lFrKk~g>Hq|+5r42{-t(~dgl~Ap@>xS`_X=#AdEfv4${%01Nje|Ok2niP$ zahHGUg9>qcRT#=DAtEIiq3_IcP3GEXP{r8ev=RQeLxDUfwd7U1sN3g^ayt}(YsMaE z<(B5*cV7#m;4*@Y$Muot?ECk{oM!F|aqT~L7nT(r6!kpEJN~*2(s#Bf-cT4y^@}|H zp^zC6YhpMpiX>tF;aq%7K8GhAoY}GKsX|faoV4qfVT)hp<+I=mSIO zvmK#c`Em(8687`2LU5@6);lu_t7C^kSfDF2ayP*UF!>L=>F5cKsQNQZ-z+nxylHLC z*XqY`5Gse$9;C9W>+9YjnaPRpV5(~Q&Iux{J-33j&Y8*=v5gS8?i%=x5=3KTM9AJA z;?tzX(G!LJxEJg2jOPJ}9Hts>ihsSiSn!tO(d%I(NR>=wo5MTwMOeuneux+hSN`KX z?1t;nlkp)bY`&8~$8Sse5^QH;U`FISZ
    o_N(i_h`(&Ssf{uH<*6x}q1y`aP>U3>9=Z>-EB1bPnAwc9S-{Mz?Pxhv zS;h~rP!;&AfyL1~q&QrlRh5jO&-ZBVi4aZyn z?y>A4At7;h?K92S0T=LEkF(NPM-wm~gj-Sn=&y}o23@pRebR2>6fA?U`1ts>Zb!N5 zA1a|-h<)Jlem|4C8PWzSC?_^t>u}H$3RcKLqrIqajXG(W>5(9lG71)Ep5HcY~D$Oi`ARo`5yG63Nge6V!+Mk`P>Q8=u@cK*Td>t;@Lp}p{4;oN)^b{m&(IGzB+}~z9P>jU^g-BSo zlLM6xih6iSl$9tp>h;qy6BCnc5)?@VSJ&F(fW|o@G{}g=3g&#>;GHiy4HCaRlV7K_ z{BPm;pLd8UpEd$ume3CUEq@2jqD#)2y?_eCaIqBGKwWl_z^S<+z7TkH>Lh&o|mX{C~4w>wMF(lP|34R!U` z^ZYzgv14t27;c5?z6KCs3N=TlBf&OhQ*H{1Bf&U{+Ba0ZjB9IKFDLA$dT8_u{(XKg z&=PV{i3VKE0f^KVK#0n!*gI0otWBl5O2a z_+X_wet*e)wtwd?U+~bO7tfsps}mp_wqjt&tr0LN`;J#D(I<0C+GUdHcY-k;u|6fW zHd2G2hnVCpofI)GhAmK&obUg@-B}`tlIg3-D(?kGSdT~yDVE=E1sku?c%Qd+H1+>R z8u7Js_(acSs9f9puB!H4HjxKnv0 zQz*GM`zH0aGV};+tzf!omc5+7qOvPD>n4a}-`dv;k$a69@pdpfMEDI#H87gQtF(YQ zZw@@(e;Fm%I$_b5`n1WYfIZw>hM~rMQrUV17%z1KEtfqQ*k84LL50u=xzhR>JRj=; z|EOTo2&hP01uVxW29|c;SGAxy^(dr;&3&Xdoa}5=y#++9z-L2|K5ZHAiLf4v?mF4s& zM{nuetd5=cgb<<51r|a(`yZBxw?Ye_71JKPJ7w%~-U*Mo-y{9-RT!#0?%q=OiB)jD zCn9cNYFpaZAiF({?4i`SRWKB@BG2aD-&Qzi`e4jijl2GCb5f;#REhHzk!cn7BrQmO z?0kiNtHSoP#~y34E-iwy>zg2p%DNvrPJnATfv&#%r_B~XIP@hZM$pchs63a%&Mebn zbO~qwCEm)GV=Sf9iP-Cs34*uo2g=4LR|tqdET!I+29Y4Ru_D!dNEHVQRlO~7rulcK zJ$C%W(ptq_U)c*2vO`6!{H)LsEz3}a)XN-#-stZ6O+X8GFHP0%PNew-&DGf)7-yej zjXzawiClH`3r&m7G;%0r%-cLc)}PA#lvLmFDKO~~hL3@$7lx6M*IYj6xZ1Y+S^S}x zBCuuTBQywkJb(Hb8TRUt{`I}5KF8&PV`y*K{DM1&@MO`+3y6MIwcs~p1$0f@NXR$) zmAjkb;U%}4x3@P~?7OPcM`99#GAXd-kR6GO*p6ze)lIT&`*Zu%9cx4$mKnhlPdi2v z7oUw$A<)7=h>=3aE}EsujQ2nif7qw!+zmOVBST_3f7z)-R*5qFao-p)sju z;NPs%YoquqAv@uWkV-`Z0J+x0@Bm~cO z?}?%vl3zi@$LaAmeW@%bC2LV^&!Rd^qQ!#1-s}3$OL9S4(RFrIOIUc4lV4?UJ$?Yp z`W4{0LaJ_quJ^>2_byRgz|#{cj+~H?kU>POhz@W)BYuLBQp|MgdnuxqV|pn<5n!nXB`{QD z@0}rp9s#G4M@hfXtO)fL`xY}Zae|IF6`x`I^nq)+PnJRazgxyM3gRl;XeNeQ$5bF&xWRaZC^&^Dc>Jihre zK@IA-p7p{E>?;0{E<)6l6q?H4kl}dpci!6m-S2xhia#RX{d95)>S$N*l+U#xXvVJ7 z>6ZM5R0rrsEsJfF#Iml3k}hO90!~uhF0&HGvWIh9;8Tld2SahAu)Ud|KgLJbglF_) zw0`P-dt4Y!8R_H}gWIAQ93s73{O;Uk0;V()?t2zm!FSGEeMT~$cE9XJ3nILYw`GAj z_5|L&%r!f*p?3yAbPT-52<9m{V$q*WcI({ns~M5Hypa;Bd7qCpSojr78Bq05dSo`A z!H{14QWB3gUdt47ebY2i=dIL#RMp}q6o2fumdoYKI+MDVzgBKdQPFV*b)<@4b2&#h z)?vD-`{lQ_-T*D6h8{5Vy;qlbDO`mXx>sNh{nl52?EYq6=T2aO{BNqFvL>k2PE)Tt zeJ!-cC!WXtlMtlt_NrPJ&&QI?7oa~V zME+tQeB&Xoc&cb`1rfRDS?D+DO+ceMpD1XlTgBC`F+A7UVM)wv?^5)U27uD!!)~o=UvW$fk8B0dJM#1|)Rm zz4ToKZr~qpZ?2}|X=!6_!}X1&u=r1%*ZUCxN^>flt+LW8NC(r~FeiGAwd;!`xPEvY z)w*kuWsEmd&W_;+*3UFY%ttq%T?ZfRk2HhQhOLz&;v?-Pa9zy=4!EDzGRDNP^GUkd zeUKsDRY~80AzYC`f>N4*TwnLDe%b$wbjRdxCUKO37?lbb-7gR!RYH$Wcc$#O$DcB} zLJr#qgQ$PDN`wlF&wp=mLvl?xE6ejU?8~i!fy5QYhy2|{;6k1~BC$-;e=Rx&tDlR6 ziur|ar72h$E^O$&!LX6OE-2aJ-RIK<0rK?qay%CtH3yP<(BJjUtu1Q0h_1^a1N6q{ zU&(7;13JGgoK#v9x8kh>XZG(aq7D}Ra0Glh)%BIJRDXxN-*VO)w-E(mEQks6yS;z4 z$->h*Yt0Fgeybm$YTUZ=wS6iM~mrg1I@U%Yi|EA4tmWud8R zdL)S$Y!*^iXT_E{6bRrjMV9He^*yASA6U8fsr1+N33;YTOD0Bd(zTMs_Duudeg~=k z+=?eO0Dr#jx5wUB$rKzVek&q$M`oApsF<1W}$F74i-t(A+7F(64sc(962w&@+B@Q~Rj< z{e1r!H1h%)pdhxv>x;F6=wPEaOIF(`Sm%7eD0ajz*10^Sn2p=*b#Z__zfL#*+j4&T zjqj@AH-44d_p}!L@1ZG_g)Qt`LU*BQzFN5m5uCj;28iB!QNt~-qJQ*6d?=$rc%wOB zb7x(d{UH|?ufKOGZtN|r6*UHGH#X}Q9OlaytIt12E<5q(&&!}K6tRafhyK&u3NRP6Bx?>FH#u(aq4;6gSG3D=y&AdztB zLxHheIGD7{)8i3c_43a7ifQIJz}w>748bE6c&-BPC%Mgf%#NWH6(6OfrL%zfB5jR@ zUgjMm9-kIRP|L)x`Xk=?i!jb+C~$GezMBB5-O{{8P(VFeThGsDE?~X6Ln)2*hM+)L zY$?-X-aSrX%htiz1SR46OapK8-L1#fvv-`2^{OAk*SS64rz9Z>0&PzP1d@0H2Kjja zbK!2-$7U#Ill$uM`F=4uA!Rp7*f-09 zi92+8^S-b7!CY-VIv=;&<5Fu!5d?kCnx2z$j1Ijj2{Dz!t5nd!4|v}sSht)2?x^t( zt^MgB3A`AQ052@e0cAXXE7n^B4lbSHlt&1gWiRIZVt&;zTw=nQrMM#ewaK^C#gV4%93Gv|u7@OhbD_6yw?-Q7eo<-{?&9D?^d=*Sfep`h8 
zM1*}_$JQ`Qfd^Ztg%AIs`D8H8QkA>AfGaDwp=h+(FPKpJ9lxh^B;$p*TV3J1I3cdv zp&PM@Us%z82R~aW3-PrRxZ#2>sSk&xycbjMP=1g2=19~aObUZlWhzH+;6n%&Xp`lu zy(^+_5X>ua%}W^RF>qQb_nQ@1>1(RKbQdc!Y|%~M1H=f4ARH+`DcoNSpY}d0AiW`rX8@ZZG0?nL1zXM4cWdN1!(tWQ?@V@| z6}7iZQi^)(BY76wg3`+FY}eo&HMcidsC2MdRF(yHvA!rTi(#7ui<9AUQWXB{3>!(d zIuFjO62PVbP6tdxENYUKMWC4?OFr4N+#SmhBvM6(ZczZq)Ay`ml<#M&qXC_Bsbt$6 z`S?JGUvs_|J-9vMT}e?KyL>oobv%=)q@q{bT#b#$L4OntETD<%%u}nid;M-MN8+*W_qPj9{l0*=uyJelVEfF>=zCva zAWEu;m+qi9SYH}*8gWKlKi4n3xB@mJ1Ii~pIeB4W{opM<0bR~k8V*hrjS_8(QOL34 z2eqvmO{W|AlQ?71f!pP}Ejhb>LazZoZm`fa>XgqtSHSHs`=&mZXYeMj)YhFd2*Pu6 zs4GXJyvjrcSZd|{)K16isvA1}&e}nFr=k~-obPvl(Q@@a zO(%QCm>l_&a(X}nxaOVmu+D77t#ZF1$#&0_nMX_ zy2I6XC+#s>f0{hL!AS2$z-Hm9$^2(;AGLjB^db2MCWzOCIef<-4qJ2R$ko#R!3H0R zPc@3~tZMIAT=t*fYkJ$fZl~5nlKWfy^e{ek(v`xs^l0V&asxv!#ulwslQn*o{LMk-aI-{l z?NiBkV(EQL{EX=F_Au;ZW~#z!9sN@&Vxj&?EKD_{^_Nm{Qgi?bsq35h3A;gi?8epCH#cdJBAA`Mf%?&;U#+_VScW?5Sc@M%ZGyjTNHp0?7ys<AqP{Y!%B^dglu}w6L_q0~ zj!k!Wr-F2Mr+`w@NP~2Dry$)e4bt6R-{PF}eD4?x{=tU5?={z)*SzB7>PI2;yqJLN zZq_Uvsk)nq6~h74W7)SN@>-4B8?on7N{$La`LC4zLxW{O`iTNT=@_n^g~KiVe*9t$ zBf}XH?O6ijx6QV=U|;chg-g;-AD(aGljpN`R^tL1v0r`hrm1X4dJ1`3>MDh94`?nQfRoslz`%mx)ah$5Tg+#o`MW+| zuc1+{%vXHytAhK=*@!l=#a_1p%Eaz&?D*?@4Av$01;jL4LCQxpf*w;wO}XMeC+#1mIeO$0+NfW;4Jt*0+a zXV4B>SPwyzfMZW?P4jbr1XyKNZ{@yNNt~7+UUI~Pg0>>d0o2n z@JNHnQo~(<;abbvGukQ~T>BNrKb`-21$J=@zvmGG{;V{)nPs2=H(36Dv1#f2Gy&ei zjO3wY4rzqz(YGA#fxb-KVHx1vv_AOdzLx!aONz8=j#mdMjQyrd&t$F) zYyUzEPqCInXlb>g0TJB!y5i0lQs zVp_-LoTHfPIta*TAT#TC8jpNl|7RMUDg852K}D(&r%_)^wc8G+YClgjZ(UTiVP2a( zKBIbT>$g&y772C5ovkB54eN=RL=P+J6VU@_Sp$*?aC>o|Y^p^Q8Cp__atekfpwj-O z=n>v|oz6O^DijKH!1a0WeOj4iJEh){0<8`I(6yA0VS^j&9c7rxmzB8Em`YvuEZ4nD z#Ft1FKE)dymT<1QsI)#1m_U8ALpbUle(x{6vlF(~U1jbGr4MJGP6Qdx1K2?HP{B}Y z7Q)FEg6@}FfOf}0jhy%F0UD(5V;MirZ2)y@EaPpkvX2J&vugR>i?ncfQ8ZUIdvU%h z0*&*e3cI9Xs6t0r7$UhaRoI{c6SkGah4sxDECi)xJ@{DDU7+I?_tRg31AK1n&Y}rF z8eZRa(yd$bGn-?x=sVlU&Vrribys2j_p8lM8N~sia{-|O8kZ~#yQ+*S;o{6n9WHV@QZxTkL7r11I{{p41?1uHx?7dRoQIzivHw+6;MHCmC^ z+b`(UuSl7c1>rWlmU1Y;I`#MGV3QB2<4Vs0J?SwyUJF$i4Ui2Gu{rb^LC~B|^Ke00 z70Hp}Dvd^!^)dc7)+jr{r%%@5c;%On@3(PQB<{C(zAG$I%l(tSCiZ*eOeqGq$we9Y zbnBNKU>_Ar^_C~&ybX0y=cHHmJSRa9lek+?zOO}n(oXa6Wyuy<={kihRz(B6h2ZNT z+|A4vkBE>ML%J3jInt1lA5LvKm!}WE$9`!bIosj7s`=m6kbWvrpw(rpZBd}EWuFlj zM;9;h;X0RhswowH{zMOuPN(M>i3A;PQcOwK=YYr98{{|Vq(=c6uj~UTG{#KdQkuY2 zn0bjIa}i^$$PMR#qyiW7!VSKCNp)kFJF)|6@9dVbi+P;l+!kj{jAh@~4^&Y{ad07GLkUr|_H%f;8EF=!`I8hcYE=_PcUzUK)%Sli|Bs266u zfW&xg0A)4-}FCzdaK;l1HP|mP9)2M!%C*sI}Bw zEocPmGSVfWrCroMLdR->AI~JMX6HtiEHi~30UwSS03vftu*d^jQ4~lfsw8b~%Rec* zR9{g5Oq_#woB~;whJLftp?XB}(xI_?l5C3X; zJx>Uq7N;|pv(VxMi$Lkc;HV+g-65*@kCt@zKG{J?hBaa&F~TRFVR{iE9|m@$0M6BU zEBIxm%!&QGEdIalZrmh0X?`Z(d$(uh07R9WM{>MiOQ(>}5c%OvBplc^Y2!Dv#kCv{qyNP}gqos#y}~JOOI2Hx2S7P&jRe zTFdERtsZ3|`egi~BhOh%z;k`1z09iXtaHX3XooE@R&V~3bmmh5=Kf(`5Y90jGBM2% z8Nfe~uHj-o%Kx7n=R?ic>X(z;$Y2TOe7n6-UUz}ge?cVF-yE26UxN;~S#BolU-156 zpz!J^HR>&xVK13t0{+!U`$IJitWO4B_9#FV`pe#o=kJp)!d{lecl#Y zI1@_)LzQfA&sW`~<-SA@y*aaVKMb`gTdHLkeorRMp*+?$f)Ng654>)2If~bFXV>0_ zH}Hp=a#XYLV%c4>N}dP4a=Ht?)Waw^bdgXlB|?$CJvg&D8 z8bR|qyr$PuF)unFj+(u49XuEPKzTf!x`mgF2*-|bEjd%K_8~bP&SwnLH@mbb6ucBE z{eZ&QoG~AVc?@wm)OqIS`ZEmSz$havWZ%`dy`Dqc84UvRbrHlzu|BnSnAP%{KxC8tOsVk$F0EtN`9rxDlsiFid6>B4edO+6UP9pOD?mC!Giq=5*(8C?jOnKQrm|EY%`=>_BUa^&hvo=o~@V~338|9dGK7i z9QM&sWY7TV72mBztoF09Ytk7<-XlOh1SrVV#=G~gQ|bu zqGnMv;nhJ*`@Z>KhQG$uXU2qJ8C}3|^^fuyigxAWGyKFW*a8nh*eLjN+&w{opk5R0 zdI$G&=1TDIVs=7fIJWAFCV~hna}edOJ7Mp+^OLcke8hK)OSmuD;=;5*Q`Z#>zsR_F+W9KufH!wV<019)HQNnN+(EAOTN0LIn zfK#I3dNVfg00K(boeyQay+6R##Ng1X7286uiG0NJuk5=SQaksZ1MHkx_#`GCn<&Vo 
z2r=NI0YfjAnKGiUlv{TG9<(a?qEMiWV)du}fPKlf#cVz4Vx@c%XM&I!(G;~C5r;Mf7{9oC-K9&&(`9^hb+IXXr#SS{Ic)KE> zKFf{dh!R{84Rm*P9r*hU)r&P08*?$Suuh&;#=v29oL6io62`yWw5>#aWR_QgPv6aQ z$DJ}izEr$N=#E;jXx(16o!;++^${c-VE5#?P4LE}3LMwj-(6abUWM0K%*^GLZvAOC zn|R;1F>#Wpc|J<~&TDYJ$&g=a#y71%N$1=yO|JfBSR{&Xm zJ)s*mu$%fdb)Zdw$Ys%2*{hjDfZTk|%*?Ai3GbTi`jt~;t61grv&>6`Vdf|vPF8RJ zf|9{XgB&_?og}qP4tBj{0&2EDUR=TShxs4S{Ke=!uk;!hh7r!R=yA-3d$zob5VObc zLu<@~EkDzcj;=zn-UxjS)qW|?3Z zo!MIG85f&6eZuwq6yS&i2glVGBgXXD6Q=HS%j`Jo9#&F{;R(4cC-w zr~S*B66%B>lH)7o2Tv#c*vEvbzh{VYTLzQY%aAZmk$K0D&!yHS9U3zJZ*S;Kc^+&+ z&WI8i94zeL!HVO4%UQEnhK_6E zUiPigiLP;H+f{hjbzHeg6A&2xUZJGufEs^g~~2#hMCKapp@fX=e5R-wPmH8GkY4OH~mtd`Vap&G)Zg zkF~ou`E5yJS}$+IddwcVbMQC>GeY#{!4i2;4F<0G1pvO%oqc<}x^#Gw}_ zP9iqfRHCb{oxiqTkf;Nb0bT}bmme_5qXT*8{-tZud%*rNPKQhs1sWfYzIU-QtofXg zJHF|{v;|}ZN1$*DF`j-0tY(mUEu`_|^bG13<2mLbFz}@FbBz#%<5qOnT&O^Uc?8kg z4(N0laB>qiJm(#(;fy{zdMMWl4qZTbwoFm3RsFlWTdI5%1Uwzc_D&@V_ySPf%$eLr zQU|qzhD{A7DXioAr^T5H?{n$3|6)MrKYWq0M^r=3G=R|@-7}!w?g045aD^B@KGCvH zgV+dcG-!nojsw5-Tk&JR5_HJ@xHO~W(+<*#CTh!23YW6K&1A@QTITw(#a^%u?andm z4d2h&Cq?~rJn6$Wso?htZCZEfh+hQC-*R~kq^cshXL!Iv@0oira^a%kF$vgpOA>1|`JR2+Rdr*}tfbuOYK!sm3ta);(Ktg5*t$lS$&mu7BySdo94-5gZi!Z#@w z*R~0Mn&7#;ZMQq$9!t7=Q{n*B>f?Iv(u)vR3b3_%+(wDSB8a{LFnj)&ZO=%bq-%dH zb%=++e;m~vUm_nGspHC}x1{G@Yd%kR=>sk8+h@hHlvy_nGc&p?ZeZm)?nE|1-S}qI z@`Y;WwRhuVw#QWxwkUrE4fy5jX}>UT9D&{kg?_Imf1-^oW>gp za6#&&gss6u@-E4u`m7dw#Mo(>8Gc>bQ=t*2?1cEGI_t&glzJe{m+rdM3jTiWzjS!` z>Lo<@meBXU9p2qLx1~X)Kq(0`9pa|cn6~W^Xy|q}#348%y*Zfpdj1Yz_PR3>C|M?Y zb5*833f#rPtrq+9b@JRI!x)jde5U$8kDyOGQx=A-tdG*4*{$hzYnuO99~r3mRii>; ztQ(4^#Jau*)eAHSvDIkB7;KFM6*P`=UutI0CR5GVa0oGfB@UrWm*0QzKtx0qG3{R8 zTD?zcrzP}AmtO|;u z4YftFJ^L5UJilVoOA>=>TrzXI}8gAls)=f65>5C ztj5H=TdkA>PaTrRsDzDbH{Vs@4;sOJExS?ekAM`vr|(OL$`94JhF8L&C*?T|u#Nmd z!~f=DR|P;4VrR_d@H0tLy_Qx+Kecq&680p@vjCoHzh=KKJ_Kfp0-2y+x%nHeH^0yI z(U7*-N!^_)0k18BS7&x!ptA0c0|nyX8TL*|AgCF!WE3A*vIqz?5<$UW^GJBwl8n*v zNTc>IF|3B?z1MA$a;VS}n9&?e&tfG`x!C-n<+oK%0E@0wfPjuo7SsMOomP}+ePUeU z;MQnGlkrSxc?-yzZa*cdsZqGF<~Z&7y|=fT)U$p!!{h#KK^+;I9Av%z0;-9WT#-Ya zbyM9Osn?kWX-{s&=v~VHta`v*< zVp_hlDT*ckX+i zUy?vH>|64L(8oYIV7nr&;SJCc?VE~8Q5Ho3j=Byo*+kM{XwI<+A%%K1G6*)F{E@8aYbbVHczU<=gLhLN&(#1a08G;=! 
zZEf_FwfDoI2t-geK;^>aVaniBIGB$$BFyvcfoPHUGCxRv>1c_{(y|1{d|l7t*y}wC zuB|w4{~Q9nD2x>3Kxdq3^IWxl6R+I-%h75Z8j%fMiLkAVrk+90&#OIe_f7Fr6^1@&uFp;q9}Ed(NR5zIgut^|H zp(|LgA7ki`&M)q9mB5ZvUyTZqL>wOc4ECO_R(JQFzGGOiX()w<5V~!u-j#klXQFjE zOPjwdeP$t9Te=$mLyvP6e&*Gy3@KJ1_h0RQr|U{*gJ;Ct_HcNzCQ3PPScjw63TYX` z$>DZ(@Q)Xr>t1LGWZ`ENIjNf?ClhDzz@qOll!xkMMQ@l=ySJr6ZvLY&wyI_GxSrx= zb*DyjXN?S8C}04m_@j*wciKPh7K!IqUOj)m-LM>|Was1jj|6;3!tejLWLY$_|LvUA zO+4P4@6v=bE4>{0<{iV6(|}B9h2&m3>H%f1=DsvpBj7dajlpHoxV~*&&2$Vze`6^p zQeI%9Br95#1Efnj&3@Kn@s-&Z?VBYjaAC%@FR}-aB15sce!)|Z_V4-#nmIQdeOfXaU<`q}c{ zzCAbe?P!eLWd$b*0vBk$?JLDTvA1H+6X_MSp&;<;0;cddM)&VVEO%6ZfeuMz;c zfU1v0{en3v$+(R9L$^$iS9l^5*AD41j)&KxA*tY3C|psbgTv1F?#OOLj56#%^}?q( zos%#jW+|rluc3rPx`+2gR~?lEL@S}Cs}U6^CM!2!S2zw0|Ex-3l!Vi&P>mNph#i5O z0Zz2jOX+pKCIki`zBO99ykwgghO})3=_gs_kQ$;nX^$*n`zFJ&=f$pSpRPABuo*`u z6R4d_6J)~n!!%l|EQlnL2aQzy-d5I; z7#KJwd};>ViG-V5=Hp^VD@E_|G}$W*-aoUoe}Hd?lhv_xU4KBUb7vLE}Lt{+Azr%!TZXWMYbCu*$IZK1Vxlm6@qtY)%_ZS3oOB-Cv8A@r zjoyReb=)SRBqPhEvZ9~C=3Sor%I_-{JHus$0<{K`H<_+Q&WN_0D~H2l0oP}+>c(AI zb|x!a)SO!FRg7H;zFWZZ^QXiahay;0elZy+^$OQ4arHbvmZMK%^yF=Xh0MP{vuu*1 zy1mr~D+}p#*t@c*ckf@!I9tcG`;Z4ya3X^R3ITN5&i97%Pm%@(83oBp23u zsu^fw91`tL=<3an*{k{9%@jt{#T(l4@we+2B(j4>++IbRhLW@@+xZOvN&BhF#j)4j z3Bp52744T5?X~hT1Wxm4;BKJ38iAEwf+;+=lt1Po& z@mf2xLQ9R-)_}ddu&KhiRVf+pz`wzb1pI4E((Vi7;Fbb(!G!?9G|LB)9t<@HbeIn2 z0=aSfGsK@hHncAlUKmq{d^JA9_;G`uz*jQnf?nlDiaGqQjmbui_)6RDKb#KKtDA)= zR3uj?c9<$`yMy1D-|Jyn(3?!6Z=m6{%()!3(-*`Bb033HsT2s5Sj6v%T;JC%e(hEk zglcMb9naBzjpy_Kt>*kp9bWr9Pk|4-{$?(ZaKFYwvm%av^h)pAcTJMY>%%Ys2ZVP$ ztoE~xbn76qC~Q0Qp|;kX@vk`-Zcz&zpV>G?WO8ONN_*`|TuQ@*fdSX3q}MlpNvU$* z?SUP8bvg`m)q_B@7B;?vqSfFA#7c{lA=sw9Nu~xwLkU1#NvxFU%NM*bziJG3Tw=?Wacm>T57Z6iaP(o#9 zd?!QA4zQ#6%XE`8y9aIm=`ZpBYVI0TPCl<_dRvXIBmK$pTG;l$u`f@*1ra{vOy6}B z$OWjE^SK!N|fyL?gKRJ?XMSL1Q@MUBtw)L-wyJ!el*seJJgA}$Te zR^4pK(k~*Zq=}|{u)YujU94h*{by`iwU@w=NQw7+aM@n>;LZbZ%5nkE?b^?1zS9l8#1e1WE%wR1SXdhk*v3#!JnbX3e|LSut;U5Ux zgI@#J%cyP^WGKQC#!4#x($Ll=f`+0QvD@Yw3~UnX;99dKI48_Ox4UELXJ7Bb)DXvA zcApk8hEZqFNns$b5*(kdaQ*c75<*E$1GN1udBpmJ$1)C?dzI_+M{o5PaWSH3zYkns zIIR*flUt0Z7Dw$cGP@`I%`Tf%V-Sn6T+*74*blh-m_sV9hiBVqXFeFGmtAO*?^tM3*umM=Z|;6^Y}1~6pbdkMW84&~n&)5B8(qmgnC z^Znc{HB-P=Ly4HHn9hauJ@Je;6wxhTfwvz4)KuTTt@}w0Y&)HL9({u7ouP~}jO+W% zL2kd~0lJY|Kmn<8x3~FEm_hv$$f1&BxZe4iBqhRa#(QHSA}Xcj16LvC0raX5v$Ik* zqW_sblult{p}V+}d~)$Doe3P>`W3iG-V7*9rY}ejKNQPbzdadKMZY#{Xv9T&y3a&5 z_xN9G0Qso{l zBdzUYF?cxqi1Q(>Ox6!b0Dl6V#h*5xHeGD{WO4)&a>ndJ!89!>Eh%xqLq6%XWT!7x zL%K*mGx^j@b-vN4>OAEZombYYdC`ZjNA#&AK#AO@&)IyF8qLIvEo3cKng={oDFpK1 zF|;)TQPh|afeXL_nDQkjZ^++refD12$ygt!fl;Dv=%Q`#@xTI{hzt>(ShK}y1dzKv zh}ucTb`gnR}BEedsOf1_T(7NfpsOjh#cyCIlr)>4d&*rPJBL$Y0` zp?_-<;XgI$oJjI;10EUsIE)Fo1+Py&2Nq$lT|Ht#f!kGZubnXGPY5@`Kth@0O1NV0 zf`uQ(HYW`YGT^I5w_5xO=@sO3zfJaYdM-b1pxwQJrkX5UI?G9`mU7E02_6)>dW(dGdf!3@?9nF3&rVT7Q^^EAFq*mma8mqdpn&*8m| zsk6cR>8m4;c#o}SAAPU(A6|brP>R8cUK{2)`m(R321vtyL)v~ECUjJxB!F)>KUo`z z>vvy~=E!`+?{l|&2VF-3rE_ZJEoL%B&I8MNid*i0QEtqvusiwF3pe<9pS81UaK7|d zr-^vf2~%M+OvoJJ5$e7EtNrEJCzh7cW|LRPxg&DT9j!xE0!mumZt^7M`?3bk>Gw}I z=XV=ezM5AEdIhGs%E%+rCZvHT7GiXQFcbnX6f06NgQu20nq==FJ@4Ygv}WWZ6}ljX zU$%Z@7(LvQRr;u7K`U$Vep~Nu@pVmq6(vX3yV!(-ufLF=@_`$@VR9gtyx zCpQH4)$2(H?6g-<-MhCEbP0_+(!bU7m03wpk3=>(&`4cIY%b-+(g}>oc)-gIynx?Y zwB!ulcT%o?x%-R6Z@5SUJxy5)P3#o@3H~ouXh#3+B`kAmBQ4bzzhOqSz^#lwV#=S5 zT_SR){%TynQ4B;HUBxC$wFJlD9u85+=LxBEs_lYY&}Vb)`y^C~y!^0VGDa*|@?m;7 z>QO%7-Jd4or{?gHGX0p_Z!Rq;l=2ai$AkEUV?q=#q&upP8xbaB5~rfbsY3^c(H=CjBS5&29}iubMQ7vhNf zTM1Q17VeQ$BMuhKN04osFIj?w^gcOL^LNHr5Wd)qO4VNW?u9V_%c_@o*00lmmuG$- z6QDHE{Ez~QuA2QhIc7H+J%Z{_a`0LO7UEPJ4^zX1KN6f8^?rQ~xE=drH6Y=f!2XV> 
z%CdbNXrRj%?o`&4OOF7He(LdtL~fmS`Bv-tHrdt^k^>!!b7c7Z0<`YE*f99hE zo=r=GN!$@Dk-i^&j1Sk0YxB+`DlmC(nDilp!e+`XF^+bpqssOJx6Ot_P)Z%y!%VRz z%Q{lnPx|R0zBV4WO%8MEcOIVQ6~nhTBwlFyG`CRs)jNQiu>NFBxC+=}o} zc$$8~F;0JBYj9xmWBIBrbMFg%6ckc@JXGKo9!8pjz5TQ0sd-atAFer|v@4)CT9h`t z0RD8Nt*lx1#bHC0;JKC-iDDF3olR-LT5Xv161cZa`^XGG}zgLdCUum5+k1@q4% zOSwgXk6l)4Cq3lp3O7TH_Q?)-o@p15*bdNT{GAOv(ERKgAAS45hy=!%Ka$Ze^v9gV zTZIO}kI5E!Q0X^D3xzK9qc^%4)g!sDwc8NhJN1?s#zD`Sj{S6#wYkH|pBVE|b2pA^ z+pRAWM;B3lkkO;5J-fog3xBsP~lOpYI4g!_K+nu|I}AY<-%SNEe3`)d?$0+ zDvOWl8)=3R*j0XrengDIOS09%%69f7KKX2ceb5;byk8R&b+cwLQ4z!g>rg+B_P*1B zu1tPz>WHl3(&y!01{6){?v>xC9CT|O3VxHQSy7I|uS7%FjmRweia0C&mSoJ;6@ZJ@lqU|TwnEg4fS8|Z~r6xK_# z_Q8Bc2J!){Y6^}kKX$%YaGa0@8L6_rm{fnOWEFo=V_eu{VU-RR!?zuB-Ps_Dt7q!R zqdY($zra=xGT~@WDWQRc60v(;+w$8==A8@zRZRQ0@L}WC)veHYzcW$LsDGW%0A0Z6qPth#~vtVGc@51>*s!LnnSOesnUP?NU$%pG2`;@$%~q|l4d7GPt^F^ zwVkg~K}NB5rmC(RIKa>(Gay3}BJ-p0#OyuKa5}#};rEHoVqKQL2~(?7h|Pd&c-mrs z8@qjfyCiRx$xE}IQeSHv7hu)%8_2^H8Xe7YR%gRqyt3rJm{iFMViqo_vbP;!pe9&KfvxB*s=cvmH?+5qJ4^r`N z4a|-|TkH48jc}n3Mk^BiR4nR0L%Dwo*Y)BJvC{s6OdX^WImDm#U;X9%G0xwx}`!dJEctj@UW6G{HKq^Y=K+l zf1Rd5rFF1&4m81_Ji3SGY)SM#pjhar ziXv*Vdw<;LbZIz1rCk+r>^a4mG5ZjRzafII4|(z93@;-g(V@~eT_0)_?d*;I(S|?` z3X0F>5T=)ri{eN3C$W{a;XwTC3zqOpw%0dJeT$D(bL?o>8L1C!;Zp?IX@*Xpxfn4? zp)sj2;C^h(E-{NMxyDE3=s=+Q!aZP-j2UxWBb(N6jRp@(VpF*ecE3p!FOmqX;!f+q zq-8ZlaUDsnXFr*ug~;zz%h`~g>=%X4kKE%Xo~4ToZzkp*1Z(%cbh!f;3d}pTnVwh_ z--X42$PZ_zIoE8+MD&F2bp;Z~i<7%Xg0is>AEE4Paax(SIO}0mUV^OeL%oQ#bQhGu zd4a9C>GGd{4sNg8HhC$gnIBWAlevuscQ>fylo;HQ`@-D`jO%bcvC{&ONAe2>_lFg$ zREYAC{1iTaV;R^qd^av;|1srjJk1&lru?8$w?bbiGXoE!z};{s6>VCaqg8D9h!oky zWdSaaioQF03~UOr5Hkt#NZn1KV+LmnAlp4Kw;@DNR#jFq8MH%%5%bh$JOBa*5up2d zsX5t`BVij-3jAy5s{!#N{&eUXw6aoF z&8EshVWV-gIxI@WsQPV?CXUFIcX)k0Tc-n6*RnggF0j<_|6qoI}8O`X}87B-4zq`W1L> zQ1mIIOQsq{cCP>{v76XsVU;pH&Ev2EeYQ>zHvQGc>lYM6=OZ*g4ABWR-N0EFB@hRS z^5k&7Jf?g(>Gvoda}!>jbw=#6ghw9v&{Um*hZG5gg!g<{XWR1?+Mu;w;(t@hOVt~- zyNv3*w%QNoY^zgQOS`#H@+>V|dO76&zULB(#REFI_54jbUuYGFw4*he$4-77DMX1GE1|T z>w)Y?iDsEz3zP96UM!7j|7ByJJ81Lt8q$pVwHoYm^xcXEo5dJ&n^re0r(TXQ&jOcv zv(q__W$f19*rrQByCVwbN@OFBKHE!RHJ1epLqs)rFvt%ON5(U~OO5f9bBYx`LuD4) ze0+Z27^O#6%H`mLdCB|MjNT!Y$Zwc~mx+=&uJu*bUe)P0gQ^U@ANKw93Uy-mhgp)Y z@z(@g_7u@EUXR`8F;!A_P0ZhIk_%x7kIHChGu66x+~H}hZX<@xw8`=SPNm%>H^;TU zmOOH{h;-AMzTSVW(PCF2RA9x`QdS0e; zjfCe8@aK1M?e&&v&e!n^-861tJMW*Yl-)Q~82yRq4=`ijc2lu&0cjd*D7!>YCUx

    pV{RicwNRa~P&!>8=qsqqAeFH@N`n=Kix`)2`Jh}t!@$RnHC`^Y1@O9%}06KySqPf!y_Fpt+-PNJs+_%$^)u*Y&JhIVtN z#o~`Jxr@-24}v8}I9|S&Vn`iU4CC+CnS~P!RDL+(+o0veIrNevUf96Xz)t?BegHjJiP#SnWeDYh- z;1H`_=>m5InMT?}y*2Hr!~0TEpwc*$vt`%o?t}Gf;iy~Qz%7X=WB%viQ~>59892kc z2ITaRf!=C9o-nZm>OdaHUGe=s?y%xOMzXuBr>Yd0U7v2!0zDJb(&JEzWhpm=Cy{CY z%-&hP0i9Wqi<*Uj|K6_1BLo3>y6KYd3~_1)#q9sCoE4Os$B-y=Q1yLz zL@7>4-cdF9-hh3tnaIfI5mS>NRjvO##q*agC;{a=FVL21a*y{ZcC(65t9Zm`-0iBn z;e_5nf`#Q?Z6pP9SH&Ow*>C<^6|6WMSgOXgsaS;Tw#o-|oNsAqVHJ0zKnc$N!7T zG9r)XH4_epBi%E4`Vy3 zxZX5%oDn|!wy zwuKziwQRGnrgOfyXF@ccBg|jOuq85{P2AH(arDNe(#mrE`FTpCj-3?^Zdf;e0m};7IvRjsU~Fc|H>5vO-=`pelUr*TdK1|9ub7|j#c^WyhWu+g~~$er+|z{R=?nu z_Y{;25Y>iDj*b;*!E;Xj9ipV*2o$;3BR-E=O0T0oRDupXAlbvzDg{q=-;P9Du&R0jumbju8W3qF|oThTHyqJb?! zCuO?l&qbE;i3uxMchEOiTVyw{89C>7=S}R@_Gg5rc_adKWC1Me?bgCMK7m3(;~GGd zb(=z939M+um0gGR=M4Td3Bs_=?}y^21#mmj)AF>)6R;6kYM(wH?$1`CEG^d%HJ1v2 zA3d3~C|eDoEfq*PYLeyK0EiDYWIh}w-bKo{qe|JnSp4~E8fzPufcAJVv8ASIIg zK6u9#mDf!ICdYHon%|Wy|eWFTdypbn&5VKS7*DU04j?Ylq?>}~19_hbQG1Iti@~PWwqtor) z>S`-lhK=Ed(<3Bb<(iwWJ`>3}(S3}z-+a4r=ht4zF=cp`QrgaDa}&C*KGZJYubl5fS_kG}I0aZS1BNpr+GNb@|RQk0a9catazw0?odbQJ{Npo}MyBfl@HLRP*%z9{&8<&6*HHC^**q8oS==zzZqL-Rz_AdASVY4l9%#}g^ zTy>e~X`gvNIeGmz_Q9M;ZhlNi2xunfdD~%s$X|pK=<%(ICY%+6y;X_#>$#=#pFUS> zw@NCU(MqH`BH#LKrOmbIvz;T=-q~AXd*}Nv&9KQel}M@#0v$U;HQ}(Bg>0fag@`hC zqH`xHoQ0o_Id;iS&JudBYvtR=#H8;L1{V%eKU%Y&qFRfAmqRXH`PvGe-KG z{OF@NW^9`--}OqsGxw}!7x0TafHYDOpi)IUW^K5NyYbrAzslMkch&V&jTHYm$8-?F z&zJ0&Hy3uMHQiC*&6K>bFu`6HODxt0uCO5XWtoX4CB)%YX2ZFZN6v`mVWpEgyD-I2 zt^wY>iIW8aeC-jzmZZZZ-7D8omsLynTLmzyBje)IgwI@K{?!X96rw&Vi%1aPNu(Rc zPYXY6A~TevQ!P~Kn>GTtXF1R*&_@MGd_3D4Q7>g2KLdBW3>H^@_ZCQJtJ(V6;IM^r zZsnA&M4X*^r%9DwCv2y$kZMRZu#?UsJX_mZB9NSL9sf$KG0r)qSd=PIAn&E{gHDD! zUrWF|Cq4J__=c~vOOkkJd5He~@{@N_8qslN_>UDry>6nXwqpVWH7QrhOcC46@ zBIP`}siig5#eHEuEFVDC6=lCZy~KE)NB5E!Uj@a&d=2~*1lrsmno@FKo}cYlLmaSA^*gRzMy=_dWL(PFrcS3MxP}07y zDuXLcg^DfFtM;V$V!~43G{bCgo9Zhu;1?mVrv5t3q7JrE4yHdv@lT9Vu5=L`MT#fa zFB%YKDP7%5YEbd=bB_AQVtj&0|ZM4_?pNeAr+u64iUeWf}dTK%J8 zhB^5Yd-H4^?swHt8;$Qp65h}ovTu$s_CSaC|rjBDqXH9U1GYFY1OO6NJcptt8B;biOCIX17tzdisAkC}-(b1IN!X*i~EhZ?kW3l%Vbk)lA zUPU3Z+YB#y6^=ShkWAb5y9K$MKDyzKx`s0mOXC+>+M=xH4 z6^qDGZe?n+i;x%mZb}pL_vLH7n4v3rMUe8f$$e*eI0%{<7YvnrxkJC-^UO{MaOH)S zl0V<>?-kO#8%N%^vvZH>YiFBhxFY7TepKH?+;CccK~`eQ^eK|qZ|Gmds0AQKL-}D7 z7szUJ=ft5gdnvn~pR0TUE1j7C{y5-LE+!6&*wg`Q>blyawQ%?y2L~>YjIll&?Wv10 zYcG4A4T&ZpZJaq%9ikMTy;S9LOP5U>%9BmroLL`WYzqT2$rn@Mmi6a<48PyhyiHD# z&qAi;Y{xjwB}bC@otmQ$6VrTMR| zSBNdC@Vgo(hM4^ELjhoVC)2IClvO3VZ(Ggeug4j8@Y;SCNPWa86~mBa4r1!E>`-!{Ze#ayC$U=T)2+8R?tFR@Q|uKfAg;?=rNRUW_fD<*)_H2Af1g0H!>+8L|WCnjSBmJ zw7qpymuu5DtbkI2G$J95G=h|LBPAf6N-HTyw{%KKBcOD5r*sHE0qK(N?tae;-1}Y6 z`tJMrzJI>8SS)ob`*)q!%$zgF9CJ(){v>kidELt9?w#70ySm40F_nfF=6oQn7=FdHnCGMc=J~P&? 
zr?Og9OVMDvX6RScUw_y{qwRq$k@tO~zA0VqQsi)#Ck+w zGcD-fYGEr$J^Mx4m$X56^A}21k{08{$E7+NRJH>RwEKd$J}Nb48rz0=-|fGvSNx_+e606(bY288F##HxwBx#_|rd) z8OMA#`Qz25WObInoWXp_o6t!!(1IbVc4riqN?3l%i`c^2y~bK=^M&z|A7PkBe@_qC zX!e`+jGA(E!bD`@p)1FmRbCwRvY*s%DL19mDZ0j2?yZuE_zGUNwQcNz+yWd8z}p#SvR~6MPMlc=L3@(GCO>0X8%brG6a}y|^ECxS(!F&ci5E-?2lgU)66~`4STpPD0zBQdk#R zj{rAPq9oKw^EOTmF7_0)OWlY7lA7gsL4cBnv`odI*UYwE_)T3rlU%iEAY*$aWR6`+ zF`;gx?9C6jU;tf7Z>0#p@d@Ohz>L0s>l}{Jwu_thVT&M6e^$CK{}Mmx@L2OL?nXQQ zbX%nH7oM%E7UrPJ3pqotq&F~P@bT4U#vL2%h^lDlYlNE6d~a^W%jagJ7+@BVwvT(6 zsN-n}hs`r*JS%w14s1U4XT#pcW1}w{D#P3x{01MuJWm4e)Pq~v{I%CJ{q{c~hZhS> z_B?<#Kl4$1W7^m#JL=ekCu<>9xkqb4dNnnXa31={b*vj zue~O~l63iGckU~KGYx1k;DL4vEie7OX(0Vo#%w2Wb)rA9w{KoKm9fe1v{k(v{q!`h zxm3|atO5f7u+vbRM1-o;syhD4T*_`%Ru@iy*D#Sxy@amP1t|yOgfc>#4 zsn84gFL8$ZQ7_abJQ!^}*q^|!3&3JZ4KL6v;0&&I?{e4_QSj=rqnp^dzZEySbQqEA zf+d>U{jSAZxnPp73H%U1+z59a`O^STA}=pnvYylZ?r#fSB@Mhd*X#Rl=3|N3bN4Y&_8B!&OqV&4&TwnOl^H3_@y$S%NejqA_y}!h0twNeFDt z=nPdtdD0&%YCaU6Q&+u%JWTK>@2R64kfw>FFq8#>*;@V4GLY+ZmL%l->l({Ke@5UV{JH@);0uT^(&CFt(?rK;m-=qdQL@pW&Bkv-#r z*dj1!+|T@HS-Q%l4Ef@pw2D|O)bHz+K5)0dO1c-XLw%%8pIUvdTQFT0+P(XyizfN^ zuq1l!MzztpC3Q(>4Vb~#q8%)o8r_R#1}%35aq*dTFSnMvxQ?`nz=U1gf!6|#F?%qF zNfFlGS0vb(Ec}L4JIF^<9iRmSHTK8X3Cd}#=?R9yXyRKD>)oJ$o#Rxl1;yIxxFix9 zB`+cJz>iK9>+Q&W$?3ab`wTi5W=*Mwedrg%$L+lEEO?v4`Z-6Q&UN={?SN@sml@z3 z-G`$lzr{@nm}bW&8hI@{%Rl@|5pokj`KXiKo*DPs=vX7qxE#=%8N*xW+JaEssNw<_ zPq#NpVN9PYKvXdzB=2e$ELjbkLW@4I>K%WGBQVYJDcm9*E%Vr<>p{WiWBFbA>c`bL zZb^&`R<_MMtbFq$C75^p{3b~#jW|`-^Ct(}2YAbJJ>xY$TN-s4q4l(mIP&M-xKbz@z&~J(*3sdh*!01&&JpC zFY8tg4~q}vO7^y`R#)%jcEjJ{C8=s8E zon6-GO$WEuV0&t=l&8tnfGy!^Ndl*GRk#u>Ldz??Zmf%}#ogZM&hwVmCd!eL5_h0U9U**d}RW+oZ)tG!|> z;gpkD(z=cbg71}UZ3bwnL^AGbE zrj_hH7*7f?8%eHxR^z!+8hl5@N>b-b9FPWwZpNY+`=sq#4EJT8?3Hf>)J$`mpAe0i zQvZl_{b?B+amZ7CY0?_i4Rcs_2DlI|LFx9hsTI=rd0jPX_B!I-y-P%;5@)BdT6;!0 z1|pxt&3$$moTL_l-7V9Ev9D8=)I?iEqF=vEG#{rPn+EML~gEm2Ei6L6U zD_brfK=*s+-WY*j{gd^g$|h#U*i1hYEJ`@Lj2l>YK~ov%HAPKG-BQw;+qy z028b1Z-=|cn$NZEr+lmzyT1XQM-7=ourMcHz5o_s{v}(R>O7;rYQsn+ibI%|aFxpL zoj{mc=FG(pp3K`ut8Hn>^i%zcIFnVSlqjKWlyt@DW7UZs6SW_!PCkk(tm86GG7iys zQP#v^?htwehSAGRe?ZW2eEfUVZb(TJjM2^~ST^kD8drpSem!~q!noFMGiRzL(MTQe z1n!NCW&$r3Kc17f^fjPNmLd&UfH)Zm?RqQ&`Onu5NX&5}?Kp5*Qk!kYRfbn_$Sl2+VzKCSPm=Ui3GhAz#rye(FL16{X9Vsr%xF z6sQxZ%_s^G6s5#Oh$ssIQ=!N#Ev5P&^FYT_5UveCR-pFmxcgMM;S%hcI@zk-HtT_b zHHRZQB-KDklkA#@pMQFnYf>W|Q211RAqpQCW|CM^-I8pIkbRS}AMs)otP6od2C>1J zXoVLuYkr?KT(UK&~o2Vb~Tt@7-= zjb}$q$x;kI4G?>>YM*ZK}zB--9J>N;gpIzYtMA_R-ps)MVzFm24SKnQ1>^?0}Ued!K(!0MLj4FIIJZrHX>|Q&G`&?M*fi@(Q{ZR z#)&cTujKCYb{vsfUOO$A#d&SWvgv*Cug$>CU5`}SenFWugb4r|@~tm}7&K$;R9h~e z6J;E%Ln*d*&=)TE2}?%YeJA-Et4%#WreX?l7wh0tD!ucr8c@cJ`Lg8Bw3y^Ub8e#4 zo6Y4ZO?{STQ$zNx`<~$Y(+kr`RgdVCPTs6G(Ia2xGO#A{ix%9^EiLO2eidV;PbEmy zH%vG^AlJ;|xM$GmTP251fh^j3zpH_AY-eu6CB~HXPPo~Ek2R*%pR40%dtpS&^#uXo-UNtR@Mnj+cEpt=M014(yItwRd$wI zXBqHR(DSa}T)o8lIYW!SdEHInE!?cpQIjeNfSJ23{E4w#qjyQ<2E>_ESZ8@DxnSNH~A2w2}~E5;$Cx zV8~X2T(XeCR4;!_tm}PS`0Y2_iWZnji+rk=Myl3)c-ef6vaXTZsoV74;bS)6!k=$b zmX?j#&U+$~cua~MB>0Fzmnu4Fp1OjCz)8w4i-{mZY`qWa#Ny)6#CxEd(nTRS8`o?j z`5h`6(IfWSdcnl&mkIOHA~KM{l5C=?^5_!L+RMw_qvgfNJ`tQ!bl}LG7j-zQuxrZ| zoUirxZT&uoDPm`9v8j%%DVauhWAlX7s$WE~4i@xg0{_mown|yA6=**Pb*60k!G~~eP@0QOK9+W}A^gQ!=$S(-Us7mqA7%AK*A|8~2$f)a?x5DTG!1p5~j( zzWm&rK2hzJdGvBteOQ+Hxunah1PlXjcx0I3sH93U zJ@GZT6UUc!0C(nSeMZWG1Dgq8Cn8$*Q?^nzF~a0{fb*&rsi#!M1DJN^Ji^)i^^~zE zfl=qVd(BACp`%p5DlaYo3v+g~bB5Ff@@vF3CPj3a~@zi`C zhdwfZLASn6r}pX%tcvsR1v4%e*kj`pXr@%x-sG6t$iErTH~GDgR7+(m4K^3{A-Y_U zcM+ij4_{&3At2zbC+8cwkQesc$3c7ih%$#I_5@^^`Sr_dwX+G8v16Cgjo~q0VqoQa zyc%GbThk$=K$*F22hP9??h@dna<1HjB-NLBG2L 
zGE)>7nESIVFL(ueO!^acj4| z$jyDKs$lye=cb>v`RjyH{^{&dr|fn2xVG>AIaoUV5dD;(N6u&;4_P;D$4k>gVEK-8 zM@RI^hPEyWtgr9h@ZYSH5wtz*HV3TU{Oe742(eRfWLUWf)Uxs3Kq7n4(uq7~8X#+- znP${JLxsA!JqHW}rBfS*tdCALqdNmX^X^Gv_RASyJHD(2kYmO?KWNyV9c`kjPLOGYcV~U?x^Rp>V4dpVi3UwPA1}> z4#F}}T*AGOQ3hR9K&+1E2U#c}yrMOnE)kkvc{fqPDEfzQKA7#nt5qO^m+E}Hg%LM&jL{`huu4l?)9Y!Q z@4s>0O<#!9MYqXlLnqXbuk znr(opi|H3En^O+0&={&ZEL|!u%HdT7zXlb`kgnx|aSmaPq*>SD4j>_L)@|Q&qUn4p zouU11P=*}6jLdWD{kQ16dn$lk&P}>S>nA0ci*Ab`yY2UjsBUdr~Qknd-8lz|L>?xjh>f6e_J!O1Od1N|XO?!XcJDkC zc-BuNBzb34yu>!87x|`liRheB+&~Aet(-KqP^d7}^8$^GQ8kIaPY9AJ3z#W8GPB%y z3foc1ix1sPCK}@W;=+~x;R0Mh|E(uw(!t0mV*tNs`u^+JugP?C#CtLGn{9sISrP=k zttAVP*W4ICFN)imOB`GzUq9$G-tg5RFm{fjX``>LBZ<#1*&{rdl;G}Sa?~9oju(z( zS|ycTsBLE=I%~MI@+HfU>%A?9GYpoUfRqV)m*RCv&2I(QSv8%}X#DiUxR+uNoUTrn z$<)91w$|3w@origvAVe$8OM!yBVz}48aC{INZnzWXX|!ZiQ)ZCw#XnfG|{ z`eOexHIXzmPij>}%oJ6DXhyMf=S*+I*&2P_!GN?{jZ)=l?kjy2n}Bja3_GNfj!uCV zY?O>U6-#bF00hSFz?y&A4nstqKLfZQ*ootZ;^W)OFSOYAv!gIfIjZDQU;};%c#@~p z-G{HHpM;&awLBEaGb}&gQz~$e=vVYMRDXC$wp<`70TyU-h!)##>ues+cYNm5mY@gi z1+TpB$;D}k5xZ&u|EKyNv+@e#te6NrJ01JMlAYIY#pl%8Nyok(9t)gS8_+$k(IssA z-1(8*Oc`(||E#`OeHzv7;9gz>cmK+jrFkE;!ONHNFMZ*#60CJi0nu`9em>}dU>~ES zPNYrG${5`*TRAsKRKx!{ug#K7V@3YiAzt)o60ajGiSx#@#(n7mT2I!Li4d6J$P-Kz ziQq?`AZo?(AK!PnkNZQj9a54Tyl3aNWA^+$LD5k8;5(%R^8?a>Y_spL3$(a1WeJDb zYG)9geA%%TKooM7_2zhC?83w8ycFqRGFtSu=@j{iBS*M%&Xqj3`vO}4uXBJI?feek zLK4%u`H|Mr{kxoxj^rqR2@=wDWzdwI29EhEH>(sxk@o>PoF1`Z>8+WPRqrvjU_u8( z=Mvx?96g~-`;q7Vy!wyxy%L#A$Pzl&Jo%UJjVE?)Jbs*|tGfj_o&D`k6Dm(hthb|2 zvgN-ChZ_b~sM+5p|ihlzgpaPe%P)82Eq26rVxXGAKE7wy- z-SM6nl|7%(4U=~Vh`cB;^d{eLC_c+9f`*Y$y#E71n^PrNM2I()iy#s*OOFlG+Y;J; z{lOpOjoq^U8zY)F(KkTPd|di++ANbbSiTc!&z{&uT z|8w#MNfnh;5%%23-|RgRtLCZ*KDFBonmxkWsp4px9pnf82i%}2XHY9@vU)(*T(0S3 z%%;%I(=Rx-8~moz2^8D5_ZqRP<#brl+sFn|4Aj{plG0tn(?m^T+#}MN<=ZO&I`?KU zLv}o9xun7gSw~SSJ1VlU2p}y+L1#u+&=^q^tX-&vKaE5)9e* zLyJNGG=9T&Ejv>Kod+;eA9Oy2&)ybPW@Db{+F>aE*uz)}?RXigi7?GG2#--Kvhmh% zKrbWk36?H!ubi?YWUAd@1<-&*WId%#p?_b=%NjG3Hy^XKuG5lQ6~P&?V@kVs@(C5x zt7_XuQ4v$4ZxBn;kRD9J?H-ODrh=)c2o%);G) zjfGWaBdsTWO%gU)%iM6H*NA4CH;v3XAt}^OcBunmiAKV-F&%d$5FYdeCH*Ye!pPxL zP!ecPv-|E)i_T@?;>&rKS!b#>nqu#FZ7d%!KS6CU9|QzW5$#K2e@CDDAZ?t9^r5c4 zzE&4m=mLXb8rajW535zBH#jV(wRm(XNfxRV8x1lvN?l;Z4!d)353G|O5Wi4z zZchNI><<{cD|MK$+Udq~AV=L|tmaq{u|G{HpeUL2Z6sea(_D`9n8)>j`jp6jyQ`XB zqtSnx%t_YMXN8KcJIG&1Z{SHks>wyv$Bqi~Kt_7dE{k)bZD^u;=m}>tYbOgQoh}}; zMp>35GuT)1`>JWR-K)8jP}N8OrZ`hn*g9kelRA+iK=HGWO3L<#NmaMrAex-;mw2`X1_|F z7HUwNbrF62jZEospMtkX4bJc`Cw4&ak~~p!sSf`U zag0A5IqB~L?Fp=EVn9~c=EkiP1{73EvAS>upNMDNIDQ7oG0u{!1L;xmsFXz6&q+tL zs!8(Iikn}-RbVBhZ@xJ{3Zw<7L_FmNpOTMp^-SPLijuOS`=}}mloWp;6cQ}gj>lsk z|J9nc1Bl3^*Di}(G{V_sz2~_@-Er6ak3c?#`)nID==LIOHiFyfx4NpS2%k7S%(efN z$DvrncAu7hpxmJL0nUWn{OiMNCBvV48<6!1ox}#DPuocWf&}FgzqNgzv9s8h!^AH> zHi5w`Qdrtq3xjE{CHJYz-fCtt-q=pNjKR=xf37NXFag_rFs4d3;Qmx!aqIJ*3q!Ky z0>7bMl~|K)_jEP_9q+<4J-BF$9{wPS5YjC9sOvE6b2q^nb^FcII`u3!&N0zgmIOy( zxGWQ3Z%n{FS4kkf=bU_~4IlMSeq|F~TtEzLa`Ts-Iuzsm-Qa%i5M7#f5( zdxnUTW5se;v8EODrRv5i?vqGkq!gnVKFdhWDw}K72BgfI<s@2F z_K1T4+PU}P|5|65U>S6On5ff+vOR)}sILO43Wh?Hvs(R5?s#%%*`8u?-O7_Zt-S5m z25UfPZmch!Zqwi42qYvX-%I?5)4v?`K>_waUNKTx!$Q@ZL&#E^Dc(;Zd*kM0w-@-(nRc|LP@$b~MzWhix z-CZXd5hatSlEAmJY;HUVH&de%C(XL@vO;#ov=Y!p&0kjNkNtTI$cPouB56il15-K5 z^o%x@*q|-*4JjSVxuii3{VmgSgjqYrN>O*!M!qTKJU6bCqq}X%_fy+XxJl_~W39U0 zPPH)h2wc^qK?56%OD?5p(o}>8(#-Wrq8{voA75f8EfUCbz+(~oKF0WAHCgqco6sPuJ)|ts6WJ&z!d2Xy(JrvzXJI_1uT&-;-C0A3g!Dp4#gi2u(&?1V2CL@ zLzo_dP%U`^rVmKoWts4*EkL9Z@_IjASAF!!2H@|Xt_Y**V(H^Ts9tB}oX{u=fs4QV zr#vLPk@PtMT!jo(5wvn??O?A{y5m~gs3$z)d%dLhuIX|vD4SEYsyzs_#gBsx7AFdU z_y?}%X;l+v0 
zu^317=~4nQGHf8+pB0;0<`=@N-1UZ^$G(IXzZNqW62)8CA^QGe!_zoF3sj{wd@Y!MZOR;q!OuNv_Im z#@vX8iR48YX=zWHt$+#lCizSQvhSqp@l^L@BF^*i)|cbR2i2gP@Srp#g)*9QdNIkw zPC%hrLXro}#0+sLmS9It8xQ+EaZ_`&A*n=yl`zSTBGP+GHSRqiPl$0SRLv<~a=i-g zeWf%L`4JloPp|b<)3%h_o7p8(ES_D0rYNLAF1c?XiJh$kKKQBOMLV0bD zZdxQz#{TB>ol>5O`L=<6SwWCRCe?RTFDC_oM)7wLKFJbd>vk?dn)W{K5GoiaHoJqm z$(Q`A)WDeQ#38(LSn}f1fS)}NiEqG z?0`;Zt}Yq+z2dy^RbuCY zA&l2))|gH%pZilKOqsWGrDodzGJ#bY)^M)Jy{EJA%3Ljtw<-k8b^GVVtOD7YdS0WLV<3hP7e+f08mu0 zPk&7m8YqIt%q^E4#m2@i=Li`VW^XCoPFbj79zm^^>pIjexU}wM*F|9Ol4p+70#>+% zKM$C=_48p57C?(*fc3%SDQ5#B^5xEY30Alq6qSbkgYK^pZ;X-B}s}e3iwCh+kR2=X& zzBEReO(DO`Nuu}c?keB>f%v2qnDUD1&o!HnNkm9sb<>UXS?-L#FvoJ3tjEt#&=j4I ze8u0-FVX3$)QJ4~tMhN5t;9~CS}+1Me+K(>EuG0_P|Q zh?#to2lJB7A0)X?FH3U2S}4uZg4M&q6*(V)FjUFg(rwdtK)<{JMyN05I?jndnT=EI zQe2ns6N{I%z3?%nF~(o#A_MvW$n3*p5ZwtG z$qX1^8Nq{+i1Cpg7h7>x&&8f_NgWXb^ZxdDd9Gl4_}vpJI{-%dUf9sZS4BDIg-Blu zyQeMS>4xqdD})PE4wkh>wZ=-S==Aq+R`hYrKahQd88>^e50e06FM8%LxsvrrQ}8F0 zO51Q-bU13h z3cBu^2tfSkt18BWegF<9_zlqS(En9?p0e?ViARxV?1z?}_eOszq8?vl>DWsdaK$lO z-oFGairMd0k{~8vg4PbVUe*}q?I+j;8QAjNY@ZJF!*9OwkXdR=*(sv^8R7OgGXSe} z4s7yaid6$rp=9dgbV^wmnvsiUf6wstAEkI>hAm%$&EoH`1NHUev3wH-scsOnr_^u} z#g`x}Xa+rB7PCfL+Zr+gvKyw>QuQ_~pK!l>QW9g9LjIt_5#`oQ$Wu zeu0z|z}o%A!4ai?7~(Rz{Zt81HAl^Y2TcFAuZ6^O*T^27N6+q$BAmE-()r=KAE4Qj zQBk#}r!E!!j~C>DkcX6~U5m##?LaYHsQVcd-TDii;gWK46*=uQkV*iQod3q??Cov) z^-W}V(gp@Cl0Vt&VeT(`H<(2rfcQ8T^f|E8|Js+VR|+_eNfXT1+KiP$%v`JcqB zmDdxzgSSl{cOMD3@dC|jFEBRu4a~tl6R?yEyRqyP1I$ZA?Gn<~9Wz?qj-5H{_cxzF5LjRfz>xn6A7GL5E-x%V5RwL-ad@JbLxr=vb%g}r z-1F&R(#p?ZHp}bBwGn!cAO(D+FhJwftUd z=NGmlgK|yE2*s)<%W(lA1uF`e_^0L#TBx&Ll|rU=Sv{L6`_Fxi@chdm^96fGPd>rLXH3+hcGOO@03@UT;*#dCGbMe2noT4 zj>o_oKpQRyB-qiscc|X910Qc9w+Vz^+8+_p2!bT~q!Ek|5|;;>tVRxB4jY0uO!bb1Kg!&j+J7u#Jz5Oe_0U$xW-8(aoh!k}Br?k3-IQ zeP8D!tEWZyU%Ja&Vys^sXE=E=tU}4}WejEC{H_Cey;T3xj5PvcJ|`Rs@!%$jC$2bO z7F=4;IRAVPVw#H?AVX?43C{ibQ%2J?IX*rxI5-&NSD-vL7@atBuiT(D255$MUwy3~;GB0R3bEXz#=S`oyBp6C)Yf3Ew<183yfa z^$$MR4@V)DWc<(it+(RNkKdkl8W$O#>h+WqYc)wZ_;z@G-EU|cSVr_i5Yfw4G-2Cv z{gSWTQN{-p&9cPoEGFGarIhnk(yWe(Q%6twuZvD2ALbG2A&+2#hiMeWNh0>m%h{3X z8Y6pyI0QtDJn**Y0B2A(b`v~SRUCN1oQ3Ee;R9fE;?_-t}vdw)^-eu&MbU z0WE$z+fN9?9p5A$)&365BNu2M{ZlfFUD#l5=OEeW8>f?1jztVS*;%7f> z%lTbs+f7o2-f$exXSpdBzN?QWl&HDJzC(XKCiz~yidhpe$Yah6!3?C|hj&!G_NxQ; zfLxX!s6E1>qnCXY((8~Yfcv7V0*2y$hAt*WfDKHQ;t_#+>Bf$~;6MRd zQ6+vK^7#M$8FO9*Ho=FEgs*cWB`6buZp~w z17z#v*zI~f!P!6V!FepbLX$YGP`z`PLoZ{;ER;KJBk^$iZl0?#(LiGo=D~~NyT@Qq zd!rE#yx;bWsGW_XF@NRSxj_0Ia0Px-Oqar3XPYEea6spLi?|7r);l4}z^MP{fSP0z ziRZlTr2nM0^oFwDyZ`$%R)T=oLoBF|`+xuFKR&5H21uJR;d(6~e`%sZASdHhwHP3h zMMM`F&sIh;`WhMMS&psK5xx3aOSsv!g#{}~EK!z8ljOs+XgtY7yS{Tg>EtIwzY*lJ zb+GXty5na4&Qh-uC+G+=a0X8utZuMO{TW?cV=+ez#b9eDXoe=ZEp*!mv0OZFgC7cW zBi_KhAEobXU}!zc1rJN1l_?7zRx0!mdRR};#8y)i>jRlB^!cDk)4%5c{oVtC2miP$5->s3!5gEuibHC-IO!%!C?M;o+fyN>@rfM$M^kyGm*E&$iC+@l~y zwu~1^ivMb0apj|hXHl{3W``u4mcN`Gd-8#F?DRySws>s_^TfD|IdsGacM9SMT%Uog z?@=Kz=ufEn>?p9e-s@%WP+)Rh#Uvo7W&q|+gqqvQed~r?(BuUAkK)JwTrVE1;3iTL zZ~OE&H@b;8|KZp23?Me(quSZs3M5*cQ?QRZext{yeZaezS z7|-sK_f@%beq#Sg{k#-|G71>XdDY{asQ3+TZVvlTSMSi-#PI}*SsXX+xe8#@M!+0L|%GOz(ZX%1FYqvjgb~1qY6A z>hl7svlr8sVH(-i;aCy_m3l+5m}EilmWY|U19|Vh0%D?%cd9t4z z-N#F^=Od44TnUGO(}XPmXyzHv;WC?Pn0kxarn&i(4b}>n-k`(pGe<0mQ=;7*TKL-eU{D=RXnvLch=xJ`nfEKUN%M*@t0U)6v9#UD2iqNWCj^#+@8Wgp583&z%pQ%*`L-q;{7MGvSxH~@2R(@c9d(Npl zzEycTNFiTu-toyv!ahxZ_Rp2c#ZDxEHB9bDe0^wn3Qw==NP;+=UtEuFg!^7+=-y$? 
zOu*~DtZzj7euI^NKEtRbc$Wa55f^6aZ?j$VB#Mv|%%^{%O3rhC_~&!>ledE%+nUkr zbn;q&LB^xOv>f$aLB^I}b2u2s^EQo4OJCg9`fi<;kwH#DA(EcO)6IpO^h6C+i67>U zexn#^)lt{{EU==*0}xJ;Pk91OI;L%#PVUdHgqTO;zy;Hv(@!l}q>Q;XeOL8l7jokQ zJmh2n(_5c!C4yktYN|!=zYdWy3Vg(-nE5W^fAjW#{#*(;J^fofGAJVY6TtOH%&Hr7 zi9h7`MOl6>AIWb^PpKYz%%JNXmFh>Kzb$Zg1SUB?$-hkhg{Ri7VT0(;PIt3}eFh>i zg;Dc9?DN#;zxpbmJmPPB@)>b;=4WeQjPKam@s~CB{`akqzTm~FjD#v&s}NMdhl~w3 zk*kgN#hU6k5+~ zmT8`&O_^cFf_z(D+)kbQ=6ih$yp)Bysb|Ng4DKKNSqJ8p#AnqVvxQ%?JZ}5_>t|xV zcckF+c3dChNdJ_b#D~7kHeAdERH^1Du(frqda7SX+$qD0f_C>=&V}9_`mG&?NGUp? zhdwdAO>bKfpwkU7T%i1Wx=qRe)*Dmq`SIiS`2Xv74_c50JV1xe_Nh8TGZ}C4@E+Fh z#(aD%#nLBtCL1|euEcJZcnetuUclT*ZYyeXGVb;J1&hgFM#nMUpJrt<0nfi*%uSh6^}yI zHK-Kc_5s)yx*i)By z%9|Um{|~cA9+GvEidFOfC*=_I1o<_+pdRMGclCe%g%vV5Vc?`nz%SUCdrjl)qnG=<8)uG5 z(bd&5y`}Q`Lxx$?4cOhoFHk@1B3~Wyl5D6KBLw;J8qs+?ewok^lE`(yJkWTFwWcPUqqg{-6sbJz}p-g0_j!%Z@9ut-mzi=MK$Uznni%W<#m zda4c+_3U_H6yy{*NgzviR%LA23m0Kv^UEy=_+rYp2Gm5H6fnosSxKIZmk^L+ZfiFn zAaWKA&B>Jrs@A44(s7vhdoMJH+)HB&Pbz2VqUbH9^rKq}84vG8bK&F(Z7&NEYVA7ge4hJONmAV3P$F>!+aEpXHz+(}xL;h(Vbp94Rr zWg)DcUHXS7SvEt!!{Vcb!!UsMz2qnd&;j_pKk^iRk%4-+Z_eEwHoRHz`!MYq8d?qGJydf$UWV{i5)_`pGuhe!G90iZC6 z59}%VAvAyGCBKzd<5ps?W5>bax_5WD@i=33a;iC`xpZKW=A#ef98()(id9IrryOwq z?EoH$Xgh>Y8 zWZS6_;Qddq2Bl7AU$ag!xmsX|GL*f@sW1rvWyEmFpEDA5#AE%_AQ7}SL$-HpG;Hwe z)5LxPjVG-+6JZzAQV#s7vWU;!48oHAogF5UV`nlm-yOZ5@f_N8qY=eAG@8MsRiWG= zo^embP2F*E7aUW6^fG!^Aj8eu7JCM%D?Y&xBqHO|gt8=#D2A{X6P7aP?$>CI{9_#~ z9KyK!8O^1_Mz-5;78C1tb~D2@vP))8?5|%u2}*Km4J%Ek0tHN0cFMHm^T6!2QvI$u za*U7`$VmHo6M^E4~^BL2(lo$wESiRDvjN(r(V*HHAv+Np_4tvNn&=^;Cy1f*l_Ny@_ zLWT$>MWwm)p#Ny|vwXuA_r0k_fY!{k*L!cyxV+=`j)paUdY?az^JNfLn1%{-@4Y5L z+gkNXd)$LxXB>%v?-(i9|2Zx% z*b3a=cI~v>;3nuO-{^*rBF&!(UwPE8Nt$w0b(#v2muY*y-R4OWjU}F{YWYafhClJ zDt{XZo>(#TL`e%DC5!5sp}|NrOU_POmR zz0NAH@IiHOZ$E;eIAZE_+vkW)F?ndw==sVNen9rSdz*2FA{-GMApI!1@~SKb02$E- zJH?k`|M)uw6f0Neb}9s(HNJv*#AU5dS>t3vQ8cIvvFzVzAq4hH(V3E=T6I!nKBg2kV)KW8(hpQZfk@#ME2Pt+$Z-er;i zJpRS&|Kag)kmbSow~dOvuT_wIv2E0Xxv+I{uOo9tF*sF!q)m);@w-I0%j~6~@GMj8 zxSDn82fen0P7U6gD&KZDfCA`wgS4ZgQ`q|P|8A;Op#Vd{s$5890Z|s2=Fjj0m~8T+ zK(@lFc1zoS;j4n+Ip74DqU zj_6>dv=Z{^x}V=@9|4~>G3rxke)-p@MgRJAVoY#EKL;>P+yCj)2;j=+M!*Ov2935h zq?tBt04MFvS(#8`P=KJyEshs42$c{>o`9fqad|wWR$)dE&7l5CIalQ^XzZ$io~F>X zVfPoI`SuWnOgZA`meWM|+8;3=bC~%rEHvlc$0T6jA@gLx-~zjk@Qwkw02Pl(%m3iu zW8(>E9r5?^pPM~59ex1Rxd!??0Sn3dEu~om!UeK{fx#QFDM78y;Tc%Il+(5Z_N)5> zNfaEMgZ50VRLwGzZ1mkd;NptNUvk`S+F!JkW35*F1CxyXuw6b9fi~mKmxemtK7q%+49w z3rg^x!AaWRT}s5(IqXeV&K{_!{G!BH7)l#GGAs&x; zUi#|xjd#7^!Av^TGu}e}6NB&R2*a@(Sg}j#H*b1}f9GA4$4R6+X;05x*6Vv$I`{G} zanyE0FYo*w$+G4Cwc(Z`o_o;{F|a2ssSA_~^7lR3^gX@??YWakJv#)#cK0gbGdU5# zDw}|JJ`d@}2u?4zHBc^&$co;^*H@#o035|HH`fbVVV}p8rPk5upJNrAD?nQL!C(hhY?vto6!y^8Aw7f79;AtE% zJ@;8&(XO=$1!C1)t0m^3ya4&t11P`pc9 z%1}6706Ga!46&8bnfZP>jr9~A@D}_gEE|buTq2X-u7Ga+$d+l9sVxDl~vE`53DxHBYYM%>pox%Xb)r-=E`q491^q%Il5R z`d43D+{FvNBf3!TtYnE?=8BH-Y=~Ydp*-9XNsSNr#LcRKa`FtB^aqXCrzvpLY~5#a zgv+w3IOH{yVI6V3hD|#PzEm)?yqKs%rMAWpm8X2EA}B8;_{kqZ>@KFl4+(L>a8hzQ zA=eX8L7`unCs#;VRFZO1=;WlIJm3OJg(#5{?5~!OUevPj4NVR4O_lQ1R`$&jO=Z_8 z9}aD%L3Pg}C9o5)#d^%)7g*7RA$$)gn)@MRQ~v=Z8J2sFYiPTGEYuEM5z=RrF8Dr+ z#2ZOMArT58ml(K2ZP{zPmccs^BYpK7~4 z%;kDk#$w0PRH~7HMJ4qHNLgiR*H--I?R@qTIDi9~T-yc25UKM1+~2^TsE&cZ4X5aT z@Tkl8;N%3ujRjTQ6ACMD0)eWJ*j>SVA`6UyrqYCCPKrNco5y(dbxPZp;1+5H@-&!) 
z>-$Vza(A5g9f3Bfpr6`FO{``$Ugv2dx+2+%0Mk~oS4Td=9LL+#JH&WG9+xRXUJv$- z!6o&G<7W-F>Zt$s$4}v5vo+RRk2p8`0Xqeb?;P)`d=s(P9)Xyi9{EZGoB86O(>o8O--eBDq4}yse4YQ2v*&JX{2Ptv1c95Nq%w6%GZsYyu&-_ zO%i+LUX|_AmRFswT^m>*IrF2C9`rds!bblOQ|}#5b=>}sbF4`A$ev{rvUg_L64_*r z?7a_(%xuCj5@pNYN3zF}S%_nAPTBkS(tY2b@9&@E@hCmwocHT`UDxw^j(6;)`}1YF zW};P;G#64dT6~olh?Td2o_oNJiZn#@Czoz!+H0-4^}A6`*Vr<<0%K#S!Ee8Q-sGR| zs~L)Mjm@eBt4>aeio)^D%ceCozPn{P<4dYq7X+n#BUY<(Ii~IB!1~5tdEa-c4_gAd z9_xvObzaAR7kH|o(PzYlwyiucF;X^ee#doRi*f!t-y8ctnMyQ`{eCAGEZ>Kezl0eu zRy>_B4PtDP(=c}~7^>XgS7TIDpaNGti$+Y@9H}N-;YZU!&}AAcL#rvbuD}o=LZHG~ z;p3EXE_q6@aB)NUazee`#B_wK*wY4SAbr#L5WH{yvc!?^L2TyE`znG+YveBrXxC#& z41`6qtE7`2BR-kA&-$a6tn|#_n^!U*m|;Z7XRbxlyCWl=?>*)LJC&?zgxi>`0`37A zJ^Vl?NaX2gD+c-+U_-px`WgqNwwwbftRO-{Od^sXq4#6xOgIkVR4ffXDsXFpqnpmZ z>+MhnaiZ!nP-?}0i?~ZO5-uADHp}YJt6lCD<`a-1#%^x%NMqeQ{+(q+It18&XF(p0 z2QMkuPnA*K(zK(=Ys`dGHrb4!wj%p30^rHq=kfQGH>VO?2#DD+)vUS!>2awSZ+|1$ zRz$&u-sPjO+qD(GSNFSF6`4Da>q}&66px#3H!h(+fWd5ZFHs0n{cP%Vt3ViXuN!LD3>?I=;k2_GUWft03+O?=; z$#;GDT+VgxN7GFgbTb)3BEW}%z>H=&)Kf*nTClekjC-&ImBwuE#nid1Byr5?TE7o>04?19&K4$MC1r|r}Yuo=8m0W=8zy zK3@)6+A$=aS7d7B-ocd{uhtel@~uwKGWjGUJUbgU9rwsNH~nDXX-y(GAmD(Eh zoF^C2x(Bt6KPPG$A?yF_m^xAZWM&c0LCW^$c{@RiQ|pNFfAd~L$@{|bss=rLdBs7| zXmZv__yDynCSN-(lbwG7N-JRU9}F3w>&cp-`)lXS?QG;~f6=dmLAuYv+%OY%X6jVk z=!D!&SbcQPZF|%og9j+Al@fH~K*xH&-(l+j>n;U5s!Xr?uJ7JdIn6^&n3F?!?8Co6 zqR2rcv{~@Nib+)a0w@4K0*Ri1Ea~b<6fp~aPB$T)_#1Q#Y;zEj@?p8)s7(7Kc8Okj zw2^bugHrlBP?Pz9=o_E4xr7DVnaQ?ku6>2N8U{vd_yj%A4OxSjW_Wk%*)2A95J%Ha+ zxU=k^$Jx!az6WTB)?!M7(pj|*lX%aI)X+`jjophOhq33tgmV!%d%qU`^ZTN{) zl4HXTW}nVMEsmUMR83+#kCE4`U8PXo;{0lHG0im7i1jw_sx9ZxKqpsggbBBA3CI6DP)BU8gpRTa}r2NU&e=$B@ zlSSa+LC)n$$MyHU2Bqbp_+fO8lysI?Kudkrn!M>nn<1jLcSJ>H4%SoYu}Zul145)Nvbi%@fm1ITAnk1rP<{Bw z^Q3E1=8cmrD)Sp|WS;=?r+A)24?*~t;vM2RD&RSy{mgYJ)xSIuRZ15)?zM`pi4 z>|g|E<6M4yNIJ$Un{J%PRX2#~9fWSZ6_ID#9dv5HE3F6@w5Ln-^Cz?$p$zHA^!F*z zm@IDJ4|zbk6CZ`zmshfNSbTeybkR_m?Ddx%dFqcuh7E;q=!brdgF2W9KraFzY+*6I zp=UYhVcm7lOcAA8n`-B!C$xs+l^19i(u8~Q-Ea@rb2mr5(0@0eaU}>Uja(oR1hig9 zifCf+6)&}kP>*q+1HL!n=Q8Zq13o{-VzkZ?`M3B#JnKGDcH)k)QYwT=Sz1~OVI->_ zE_Oxohg=5+UjFI*V2;~nCv7*%sGxY3`>>MyC8i|l+zM;9E*NgHs-hE4#4;vrLj|;` zUoeQdA2OQ>{f`U43iy=|r1U?&U^$JM+6JTYbQvcG&HD>OKos?CVeySM?!YamEd}>K z2GD+VZ;lrW$zx+klIp58t@2>heR8zln)nhe;q&I)#n2mf$jvim-~90KNe|$N@L22dgQ{W5u7uM z2=_+NSpEEJ9itNgN>I_f`}MT+UW`nOJSQ z;5Bblj320?|G0Vvj-Q6A%Cu+@PJw*V=h1x1TmKu7mF(efHzxzGeHV~em^g+jnXEaQO9ItZG$(A6*VpO76YP7i<{Mc@&$7Ng-*@jp^)uQ200u4dgT^ep^XU^` z)J2cb&p*wlKCWBHf=kJ!U~=zvel-Vhy?o_b_h62n^#qM>hFa~YF!-ap<0}uD^!|I} zuvTOE9^X?(?Q6zB$uU9)*}8M|>@aEr85;R7QhYwW`nx*hy%188^TTA?N}T)Rh+{EB z4Ye=D=g{(m1|1P+ib35zA^)hH&;x?q9B9>EEYCH0;C(&>$`Z^* z0TS~kvWDT<_X?N5j~k|~fQr_YrQz&!E~`Fr}`<2 zeW&mg6YV`q4P8VY3nU42QD-gp&`RNg&}Wz(#8po}1z(+K2sx(l-Va$DqPpBRyZLtO zy7@@aJpi1C?4=AxY^|cX5FnU&6M2V7S191vNw-vQnBb|H60e>t;di#tgGqg}7uoa6 zOMwJiX`}KheH*i7E(`6^7y0;A++oXBIROveIy5~K-FI0GcKKmg3%pq_#O#0>cFTQH z*St0A@vAW83UF%0mXs%ARX@`lSwA~iPah#2xLO@d=Q?5hkx#aA`V=AgLT0X-+WU{y+Np` zcKx>vdm|45&Ix=U{wX=8j&k1>XVy8os60%uqCHy@q`h?dU?D!?@5VYHIWbi^-z3#i zWsmB>pT#!ntjI>Z6LlVsgRYy`{II$J*OR(NV(EG)WHCb@U(%5^ItN}8>sak`-{%PO zA@%_0eSe$pbR>K_s5`u^MP&4lhw-?12LU2j4|1hl@M0@*BuP~+Hn@>RU$1xXJU0C+ z>FU|^=Ytf2aP8 z_Zadz1t~?x4;`f)g&H5t~E33lQY7%1wX9OP* zFow!=bWQ-J);EB6nBSMl4z-131JEN`hz(?0|7*5%Bj>0`42>;7hWbFWuPO+YOU%gF zznBCs6(+v5UKNyby}}pNcco(0w?1nz$E#@smZQ2Hp%kw#2boNa|u zAdX{qO*jcft9;{WGLj{NAk>&l63Gr-cm1_c3tpQVqTCgCt5n65kk#|~Y=(Plhk<{= zWL^M-%tZtRD7tANXbIi>-GW0)`|9C|$FH>H(qfk8qP}j}v6yrqt3IEU=7H*(*H z1{%52_x$Nkf^9zr+}*_mZX4-cC2qc;cvl&LDPdPHWf{DkXk18VtT{Z%>0rOd&i2<; 
zxxSPMwBOD{r0k!LLdZ+c=)h%esUqCE6oA}b<$7TS(NM?(Bdx5w$!LDf(@fn zpM)b9?-ad#$-Flb_TT55yEX@Y5CKHK^*uxtKI{l*>}S&_4ELesf?3fxvwj>}?Ws4m zwkH^7(%6@NFL{$0KHyL6ZSMQ3@f?M&7cE2m!$Uvs;1IEzK&A)=F>9?=%jx0aE*O8L zfh2z+v!Yw#ilILBy`j0o5*Bh>8ctZV>d!lS-%f{a{%Ce`2hU?!tQn{p-88Cb9{eqB#<^UPDX z3=(xJjeF)#7QVSS>Lah;32J?I7%oGM1T|n}nNGa*r}9n1fjymg`rg%E2II7;0^@VpW}C;3>P3J z#7P)JbC6z_saoEKBB*f)(u^ZZHrUtWO%_v>-d3&SpbzyqnEf2 z$7@u+eZ&!lC>8L{a4EdPPoFr@_JG~*6}BSdL%D(+aj&mf-pAN%sAO3ap`SA#<`9cJ zE+%H2oDGzA_wsDef#f=Jig^Gx2GR%5Qtoi#PPkeCRSiGz{n`wmKk{+mPwRpc?1RI7cAFNZe zj^MV1bxbDG9lf%b$U$knH9p7_j(Bb@hfkW$%h`~EquQiOf0nrh7X}cw76q^-@ef(^ zNZ{`)4`vRs26>nKZCh}|e12XWUy-qhDH%CW4;DRJ6{M zB%59eL>I%X@Y&|my{SmskhkGClfrwJ$X}S_hX{yXa0Q|BK8m&A)zyvgdcR6+tPo7gZXeU%ibY2_NNX_;d(hf1%F$4qc+zcSy!O_haYX~ z-m8!k4g1nLoM9CC7EGdo%zAMH4hIX8^iT<5N2#WCEHaNKV(0UYZLjfc5kDFYCMCx-39-*JpP{!qMNyIO*!P81F_D7E+>E8g{>7M2C?gnA%y0chL9rB|l4i0^AmE~3~8#~w4w2d zRsez)sKqE>&J>wm(clzm>cG8z>HM9?(E^}Thh$!T$lgM@3$~{Z_xf(L_fU^pGJxU- zZN8`<-9qv9TeLlL|ApdJ?zDv3a-piJkY|hbvgP#GgoGWh(WR?VI6EM_^lc?Nzck-N zd+5|XGps#20)>n0iN7cI=8NNSzS3bc>r@FeLQO1^uM7%_zUVJ!NKf4=LL`YB1OA9U zI&nizXm|uIxHHJ|B1BRKoY}%AehV+GaDnbVG$?lvF^!KJ@=N_vIxc=uL63p>+cYEu zfXaX14SM8lagKabC&rC|5Wf3C^&3AbTa_ZIxD&4tJm@~&oy;hr+v4nn7iGFvB&S!R z`_!!tAdDYaEFY%ZLP}(SF1+j%gokKl)9$xTFI3A)g}Q=emg})sP2t%nEr{%~_$6jk zxB!gKw1G>jg43ZY%venZRgM_!5n>z4k z3DENaV5%+w<((<9*)4xp2}eB9dy+Bk5lYW^YcZcFD=Qn(G|F)u9X$>BS%Z*{0YM@g zkfKFw$;htjA{_AArkLXZR;l;JJwJbK(BUv^ms}gpDn2W#1{o-l=Nlh`EwbHzPx#XV z^8|8`H1rl6>R*Z5ap7v8yNby>!;cO(Bp6PLSivG=?65p%>Fex1CVBkO_1y+lhl%cj4< zU((Z3q8Mz|dJuPQB+?GNO(bl9fdS~d%lCXKW+PZK9NbFgxZmKtu{7mJl_fHIqa)S{ z(Z{#ZI5PRHRLBM49_gv@*Ta`x^cfgK+w7C6W)Fxx%djM4LSL3Q3$SqbiHt6)B8!Y_ z9391!V&K2W7tk3d(>6J(;{D6}Cw&f#;QkDeXbr>Dc*f4yzH6-mL!`xC$HPkC2QcZF zi~he&r}pipQ(Fy3ew7SGYfvTminCT(ck_bCZPx{Nxqph_Z ztwmye_h|iB4WumeB|6W^ci{T^3iR+#WG8reaJ@|%oayNOH<;!?Tf%PIDhjj?_qrm% zeK6_^fIzZH3m1NOT455o-GN=N-iH}O8pzDN$L=Jrr-p7^Ug#O8w|S5>P5J+}1hHA_ zMgRgD4>YOk0KPGjfBRUT81xU<+^!T*bq!-+MnHjuFf~uaCc-{$W2V3{!LuBYgn(I@ zHLmFc*?oW(MPrCGS>ERC&DOsjq<%j9L5TcO8?Z0P%jPm^uALBCli(nR{{{ZC;m3Ct z+>dt*xDCEyg8f1#z-nYsT;yB1J#v3>m+KgHL^vaU=wW#Y)lMHMw#$I>;EDz zxKweMTh+}fJErd3gW8uuEMYc3JTW*WYsFP1NC*&|0Qo5ua?^Fsmmu;=oe;bR#E4I8 z! z;}r4k#aHyn6LD?sQiQzs_oROQncjWilebE5T=tT`>!Pw!xIpUnmc^r#NU#aVmNI%h z*GLTFgope!P;1P;9XG_FuoyftcE%PpgCqAPzwSdukI$U|@48*7F9BS)2lzwO9HUcf zJVHbl>{}^4FI>kPxB-hqsuwS$Plj^p`_*cF8YRTuRlLusX+_(x0mhQ&5(P?g02-6$CibzF`)_MWF~f`eg@@^c?HlYZsMXwl!+IQu&%Ng? 
z9X=?GJu{41aUvG<((y837`sH$qMc9>*hTZ#Q9@gKryzn^AKt-C8n&tPoJfQ2{T4Rk z8d)id&BF3x%r=47=w%-!XSu!BUN1{jCa-+= zvjvB-SvQFB+3_cGLdkcO@6X4d6gq|I<4wf!j^FvC{*aN`Y!PizfHzlREP7r=re#`K z@t`jEt~>t=O3l9ySY2q}O>~((Js~%#5PRi#H1NjzkaRTptS+=JW;P>emxVG6d+$C@jp}C$3&IZ9ghB zmYb;-Qkg%eko1&I?k-5Z`64qSH`dJbY$5}1m&gY($hcw9;)?`3FOG$f*pz+gi82Z= zkG3J&rY2tI0^BWO{S--nTWqpioj8ehf%W|`fULn{sE69^9%w@Pvhd1G+r-gqyjrqe zG@>I8N!qF54;n4c{;m$(J=cq-eskC!8*+K-HVd_Nmp#Usuxr(WQaY5r-3dBSudTPS zU>0I6URP1)Q%iI#r=EZCD9St*%yEVW^u4pAjr|dO022R3Y1YTkr}(JvHX9JwBs>d8 zD53oTX`W0-#Q@vG}W6RFFns&18Q+nV{JBXi}eC`L$G)W)Kn& z%Xy}j!)7EKcjkFHSzT#sPJoFsCXA!^)*wVJTa^FqWy+5>V8ZST3^vH%lN(6lXj>dq zc6N3wz*IO9TmpV7S7U;I0oAUYwl)8xEDz?PAz9hC0ZaDU23Nd)7D<9tE9`IV)A#jiOm@M*64`E{z0F$I;EIlm3}?7+C~h0jH$t-koHDriz3$%= z-N%F1Nay=7^q_*+@_ok}!kRw`r33;r`pZGFu_afG*JfciZjqXdG(3 zsx`xwBN2lk_;8kFkmh^Ih*Ef??NEqT>wFM8l7DpSf(P*jVkum3iJsogs-h{;dRaF4lIz;^HHITxwGyik7_{UA9b*nn&f9IT z`!BySfVG+Dg;MN1mBnn^dEIToLsk&Ug5rj}Ih5s-qEkc`O?SS;EbbHi@-CE-Z$eGO z9`o~?H9aul|RBN6wr(xjZ_pkRsq+*3`xJQ z4{Tq2s7!{}Q0$0-Z8O%2nQAxZe%`fe68-FZjR`~f#f9%32BOCJu z7D&%vYY2u@=4zKAb5QI^i!rS4_}1%eFrpcT1IS;svsCP;^_8`=Mgx<+)j*KDWja>Q zq~+G{mWIDOn%j9Vve97ne81ZQL`;1bu?GD#5a}yQnusi$yL9Xd3+lkIC~5L>28Pej zG2#f!e~hA7_k9U8WSob=s9h%8$bs}mW*&0u6{9=g)4m0?H9DfSx`*_d4|gut?hvJX zez9|VY9q{L=iVku%XM;2M#zh{S+mr)3G})8f+^rFUFK&brW$qZh`)SCyPDWUfF-1Y z%S%cfe!bOXI(|~r zd35+4?WAF&_qKP|5VmcpvhPL%F%vJq{AZYmb7Uo~`iL$;gK4lxAAD+YxIK-CM}6Sm zAiLnN{XZ^14*{L{X%0QdL9d50&Z;cxKO_wjrflf?unYROSzX%Vmft!nN==XmI@Cn& zD=cVMEN4uN%BDORu+Isq)B)qhuEGBsH~xqE(8#}q_-YtWVYtQhFct-F=gFCVXlDDP z$?8ItFcEIH#Yt?`Lb$YrPZXAK-#g9ZwkcT8PBm?g`KnZgMzkSTw=hsfo~l&)UDE-F zbs6#5iivgXD+qmI?gepwmZ`|@k|P_+>5jyW7ui!9r3`A8Ch=a8z~^qF*vzq?-hWwq zsyy>94jjel$E+1NlEr~Cbhw1-$}8&D)=K#kPbKQ9-wifjOtRT-ym@$|q-*caT5CV@ zjF8?cTPCXTURSs~RyvGUe@O6ldkiyhfUyIVB|{ep=+JlyvU>*Y-JDU5UjqgwyNG=$ z(4T-DXf4`lJW5`tXLpZcPC@I~TV_y`l760p1_PerjA5F?;(X(*xK7Nc*1&kyz~dn* z70r^<8x0Xd$)^O2Ki~YBd@{~74}!#X;dl$TI6gEx>bD+f`STmEd5!A1H}VrDxapi^ zLvu`jK%(|KZi2g^i9J@IwAx8xKd!oK>`T4R5|ne9*8!Xf>yBH{VUu@~cbkmyWoY+P zq(M&a<9Txe>PI{o5!isP+7c8Z2!Z_$cKM^uawHhJ-=>=4?3v5x^J?LV#y8In!R0U~ zrc`wirpEVsG@g7P!ykunU>1%ACB0HqR^EO~f{v)I3<2I=1Nry$Q~`>R8ZNYPPLz5} z%hDKak1&3*Nl#BsZe219eB~Fx0_HpDBfidHZQtLOKockG(T|y!&|Q5`V0I@ZRAa`> z^3|)$o{HdU4NOfVKZ*j4vhU))C`Opy8r7{WKuS-+fRdxy>oN%9FnS}JoGS%GyG)wp zG(HpUwRmCoozamm>#2E^U2AASIWcf_0o%|Y1eLHkUuy>Sn5Y%p+&Gm?%KsKmjM-#& zaBdH~@ke2>XO7Ks?&u;x+j^?m`|I~?mF>6t-MEI8FC%z%O;5ma=af2MJSH$9`f{G5 z;_og%5G;I8*(StB)EF==sT{0y7weDdw3`CV6nb&4oj1l1k=2%mT9-jOmpQQK{wnDbeA)^3uzB)XZd*%KGUohK?mILWaB?N+U|nLB zV8P~dVSzNb?&<#jAzZgjcO({pKp;D4x;kz~y-w-`B4P(SnCxH3n8+s6CNF1c%B>uU z;CL=Jo#T+`6Dpkd^XN67I41AEvyS3)w>4*&9$~THH3uDXFjQ;8z$u54$mk;u^Q+3c z{@evESxUi+Wh^hQa_9yQ*>S6M_SvNRfv9)y{sAKRRNqe+>nV@3)1BZ&TPLthR z)|DVLh8)mijawTfg+SMQ=NUTB&+%;vT3uHh8gRL)xf8un2yJR0ny3vtb5}<`SA8PX zxOtaK3`-I~1cvCL|F+=Vl^C@YL=UJtsGW2$$~qwkhrAh;Wx$4lhTaj%s|UHdg^qCUXgFQVJ+C5Y z3%Jv^EZ~bmqS;Lk1odt@hN4N$sQMr8aJ@rvn;dllWJz+@bVlQaCl_?P2(T}Do@Xkq6-HYNv>T30x+*6c;>KZn2?b9W zUxECuYHQWwef~H!?{Y69x;^z|&TZ{#aS5|v&0@Fw!MG+!Y$fteK`&#_* zYW=M9^Xwr$v-UDtN%=@@yC%8h*++6;{YO1Cq$@M}<=D2zY8XspR*>bqwQGmjDmKDG z4ij4L4_H_*WNt16Z*z~V7xLDHz4B@ETftT!V{bIPslF9DyhgD5X@yra;EUak5+btz z_XkWGY{yn}8!$bP@yXl6)wjKX7GMO3as8*QH^O^!7o}E80=j8VuWj5gCQFyrr_TA3 z^v2SE?#7Q~MUa~DB{`7dJ8?`xtv;E9MO=5q)nE4Sq&yp|8N$pV<$qKc#|GoR#K&e4!br^P|O8jDFU2z4`PhBilYhk@{oI z%P$@+?(^zy?#}@5_&`LorGy{VDxf+5*svyI=o9_l;0JoE2^h}YfZ9^_4Dj#&v+&P- z(?@Rb@C@%ax~C~NS*uh_eOlK)II={>obDeSIsV+$O92gqC=I<4YM^MCmd>-A+^L7* zoB!j&)P$_npNZdFayPZEXai`u4FJWbD$Ja_Z#yJ3a9jS5qjJw82OEZ~7C&UByYV$P 
z_OaDv5FSjmgL!$zz*;y_Bt>aOeU&ItYZQ&>h;esA0BdxD>(B7K@(^7DX3sNC#Q`4% zX1|L!dAhMpAMY(uw?m^EX)u=h8dU~6N9cV?_!zH-PE=&tB<_BX<2EGMj^Jeax{(tV z)n@8L59`omvA@!?-}ps<6l?!u6!itgPPDo|wM6%s{AXr%4(SK=Csr1(MMIYYbSfB^ z5nTW&pro=DG^35-cFh`#58aG9g@~W6%ALDpC7vD~jA!OoR3z*#SzjLA zB}tcB$D!|?q>B1E)AoIuSg6SHUBySh84A8O7cQftiu+vfg4m!Dj~QslWC%3HbwW;M zSYLY{`+Sfyah(%&y5;d#8F13y_7^Lmkb;i=h_1W87x5$+L|tW}_;;QhQPeokV6G)~ z@yT({KjAx<0o==h8*wa2($;5u*WAPepC&*2RvlA86;_VxL@b8kfGcZm z0U%B}^U?;8B5U@8m6vzq?_W?)9igxF=yZ8BN3FX4Q=R(J5YFulr$z6nQQ6{NCGJt- zpNZb91~knKOOx$OXQ~BQo3TW`s-v-wz#~U(l~!_U^ch~@?!aI@yIXY0U6XPw5UOmz(GEgyaF}AVPhK*h!PC_aCVFgE}FST!o zdVul`YMiv`_oLxHU4cdS;#prJ%RpVJcmA<7mn4X$`YT*`{w!XzWO;>EQo;2tr=#iPUk^A&w6e4L{40!=t zlO1m6=d83wk_cDzXCn5{qIhsG;Rx+ljwmUk*%pRSfR z>EzWkJO_2^wVPdV{uBTbx^(LsAQ*^zESO>lewq~4_m_ghhtgkXUGMeY;hD+|U*W3Q(&br_h-IM0`8;^fp)M`Yh@ zJ?J^KV%^V(O8Nc#ahGzsJ?5AUb4Qv{xH9zR^Y+}m_2W_k6_1!U+dHo~rAzMCesDwG zqwtM)ImLd(DU*-8VRPxHf?hR#^OJ?ABX*E~Rc|Z*Oy$>kJ#QJUQ7Pps$75NAup5p# zgK(zP6Qm?-ISsRr^zb%24{lVvem4r9>@Gz5Y(zy~;Zi-SPoH%hYc?4=fl>3m)6#7 zLxA#)jTd#@ZabOJm+~qm?&wK?G)*hUFeR`3IwbN zkigr$6N6CFeB=Amycx7h61y!sL31bFAcN^Ca!5d{HN zKfLTHe2a{k;V`)2!V?4@L+~lnMy7sGT^50Ten9>HqBI$Toz3H~U-zD59mzhB^+gBN zpSw~CBBCcM`>ZqFBg`G#*QPi-jg3;4JyD%^`C5tIOMFFS8U|#Zw6#^12mnxv~y` z#kMR0J?BH$mgWmJ7R4KKz`FDkJ^elr!w7+uADc8`pisK9Zk89ttfoMigc+QjbtMS) zn)XEL2b>g)Cm>rWHi6Qb=_@Rdkv)}l(%1uz#;?AAX zSQl~hAFCgL@DzH4OK`Oq(i~rl$2uF#TM(^_YsZJ7wtX+@)%teTokXntJD=0sHbqI& zC70L7wvC=r>9_of;xtsBi(sR(?lQ$>bfBsQ1p*x(MR6sQC3vAWV9D6OWkw-dh`mBF-Q;0=n{R1v_C@DG_tT1JsXP11 zVBi75L3!YzT+?HVX%%zsF)PFv7Hl5|!hQ8_L84=$UzpDR&MkIWU8r?V<2tnfLp zFswg7Q{s9Y9_qb%RRA#>6O3=j2-5o@eks3CgkA%$>3a=>d2CjZ54_H_#oE6FA*qC` zeg?aLD}wiqCdG<#^<$^z`Vn=Zi}X_d>^SOR8BpjJom~(II|IXm!ihOo69}y{7|rg_ z|7FBZu=`y7!{bC#$dObG^;r8eL>m?!+57k|^4kgG@K&DXKis7Jk67Za0|p8YouECi z@&cdgHuy_?zjF1PB!R!2MrhPK;h6CV^@_E8tCC8el)Z=o^WjbHou`gp^YMP6lYGw_bsCcy^)Tmx zI44gLVwfcOcXOg>0m|MjW^l&K$*z~3!x0Pd7;klE=ZO!vW+c%+5==EBV_#|}QL#Jc zEqL~ulv%M-T$Z(cW^8)F(gZ&jG5K%!1-Bd^x63d1<4FHO?t_p zm@(rE8>KJCMQfkgPLD%oZ^LQcXMJwALs!_Hse6tVEQpNk<44??(Fy1PAY%TvS!Q7(KYpK8|1L^jJf%S^+Ld zH^=Wd#ZAVc^Zj#a83JS^7Ky$C>`rRXqJMkc@cS*u$KXVe6}-asor08X=4FF9V(~2v zCJr5SY<7juzzvsvh1F!4Sx38oi}*|UYXsNW-9=f&AB()5l3dYq+ln~kXBrPO996IA zpiABxqjJ#8VM$vZiK9iGL&1*$@%~TQ*TH?fye?o`=kQM;{P|^d0R~qe6F78mUI5|O zt3Hjtz4^*yec9n~58pD&!@l?UYqll>l*zB#o%}xFffydy&xX*hHu; zB`=buT2`s#3>tA4WxPRmxQin?a&buF2&ul z`FPhy3EUVqQrrub4Qn53LOJQ6cNp~fcwCJzB?Bbk&|mP&k-tHX!)I_>b5rFT(f?~) z&u<&oG{B||*>zX{S0upG$oS?~Q+ua7ZHH4AUbDVM1-ikoTGGrB0h?&ym-tr3((ek% z+Y*DlD{cBVzN*U|#Hz59X(?l8OySWk>;!$EljAGCXyFLZxmE%@E~X=-vBh(#X?utF z!R|4TY=51~4~ZP5&Yqb{Q6c87kOOQA!m)T>`K4&-=IU}Z#M`PGMfOZ=>IKxj?{jB` zqni>La|J)OKW(w(gY~$vzJM&LVfYHq^84l+50kdoP+QG0#qM{KyDIgw5j#t3`?VyA z5fHPTXBS5_th~X`gk#O5uozvWu@a{jXh)@&D|6j;2dB~w5M!Z@+7;SBBhU2`}n9bY)fmnwRVUSfj5C^(zj9Oyt0s+>*y8_E~?;RM0`gCXgob@Uh(}TBtvw2+$4{}x z))A&o$b89+%v=gN-z?6v&FSk?`MAd&1ejlX3{!8UM9lZ*JuR;K*d^pg<$KG4|>12Bq$z zZrSzU{$UR*v3+w#2$O`^dQD_u~bv(8g?_(Hb=C z{(mgvzPj6i?jlW?6<))J@dd|*$-e83DnwykmjJi?CZ>p8y+4W3Xib+9*Hg02xC=NMPaSc#zpdLq*BhJp zY{Vr>J21E~ju_>kITu;p=0#J{Q{~c1R#127_=t+)K)YsQ>!v!(Q%{-cajxA{aaP_S zrG_?X6gN($qu2Y61M2r44b6pg24UY*gLn`Ui-`zgldlAl_S48Fe*!9fjc*>SK7~0m z?+%w80kNmdY2G>Gc1J9M?t2X0Z^lou&Je)R_fDM6N+H5d?TM%3!i)>d0?Nb*rCmuS z)Ee+o@M7X6F64CQ0z5G3p!3IluG@qrS0ti)S2f35iFQ|d)@ zY=siS5lH!QhC8(K;awa-v;UenYvHsF6zPoFCRJ><-6qvbinYhoStlNX@B?7Ff;wGJ zu)MjxGFY^(@cLLCwWfX&d|nKzq=D(jt}RcjGw0m$PF5l(pTr{%jd!=J%54KJ0p-6c z=eXQ@@aqlz?GkM$ON0<$-!RYd`WZJ+k}!RH0hg5kGVjPJi(Z(-q(+Xft-(Q0E#$(pvtiD zfzX}XpPJFmpmJ`W#O{jNRsIXy&>t>X=Xhbp7{Y(!ovsN{g{6m=$LM0yUqRT5&J{fe 
zc(|p_<`Diq?J0boNI-f;ef|4Z7t`2QR{C@>k@_QD*e8v}6W~u3h#n5nXfD!$S^Ub9 z2pjghC;y*&0%oS4>}xD;6Qb(&@R6Td6T=q?{9qd;PR&%Y>ej@=HK3e|j;YDXRIJGx z+%Ijm`zr;M;J(vfo5c)c#SQ#>eM@CRIU9^KDi5yojF<_Kg{RV_ElirEM3DO_Fv-xK zR=A`3iWuP`&#oF!(6fjJ>op-oeV!cE%0pt;GW|Z7ti@==J9<53vc9=pHM<=Nl^4mB zB$cU#duV0K!}S->$t{w$NEo`1x*DcRj)!#~7$>)YU}(u0roO;u+?qdZ_}Gysm{@PH<#0;_KTBmK+laAAP zPXCTWGZM;yvE3kIB?a~5_b~Nrr2Tx(7J9qTiju@I0-!^mWa#(h48RN-7~xBT8H_#K zL}*>EZgZrimkPeoKK*0Cg|Qyr<)1@z>f8oC^a6sfI-Xo#)rL6ky(dz zb&C&uX^s{6vHZ4|jqCj#XwEjxwu`aPbIL6|j$sgEoFV*Tj<_#{kr9-^gi$R^Y{NvL z_xCKoVEj2UAt7|}wJ7iOg!)@T$A1z6p;YwPl=#?bwb~f;MefJ3*M1wt=gxC|uAVQw zE#A|Z+$Q8~Mp?p?{)dLQm`J`U5I9G-mtDhxud#~|7{Um)@>zqBY)BP#I3iTM=HxCS zWpvnDSnU#TWL+^-7PIF{_D@ndQM%I$K1vJaXS2?F{D@*W-ZYPb{l zSO898UWul?V&xk6HtiN=97Erytwi~{Zw&S8XuQ|@6@s7jn3bdt$(J8tb!r)=I%$Xc z11Rb=Lma6e=$T(|={?~0eG~m)t|t6w8K!E=$dyx36^7G@u1i9OK6?48hDNbJIwmtd z6m;Yl=Ty)q;?%VU_b-dS#zelI{+G)kte)qwuDqK@L2E++rjDfZj?BDQa<}*mHW&Jp z%1o5C7nK*=F(g*D-?7ym^c2Jm)R9W=zlTYE^i{alc%Bkt%k0akHbReK&H2XN zhiCt4^r9nV;Z-Q_Y_(#foG^OGpMbT1C8CX9G*y6%p77jD?up&4h_+Jm195-B!bgJ( zis{l)Ua^9*=i^FKZ7SCAzQ#{F9uDdnZO?JfCt{;)0s}iKDbd;8Pz^Zg=8OO0q)(mm zm75{?1TzWfe$i{rM$F$Rq4Op>+c)`9$p!tZR;D?0f3Lp?J*d>B4^ul5)lS=yWCjrE z6q*WzZg`?TV*+`BdiW$(Fx&^&!V;3wnC;Yh9hPHQ{AFsm^yxf6Oc`yq^@toJvj?fA z*>4Ip#A-gj`goKx^Pr(OLsk(ghDKuo$7ho}hzV!5?C)VB(PLy_O1re4Oza@A8qt$3 zog=7zOZy3U?uq}$*_($$-G=YOm1JK^p=>RpWGh=3ODUcxq_SlRWnZ%IqiLbCr4nUJ zl4Q@mj;u+H?E4tfVC?JIX5MR*XL-Nh_xK&Z_a8?N&%Lh(FwUs#pU&G8Bv zO5_a*;B68}8w*;O9GTyb<8G;HUbM)s_PJSF{_2awt%2HHKIWN!s4ePfFoD0zZGv2Wa#is0zuRLhJ`#0#q zvu3ZUb=WGo$Q(8L7V&77yBjGXgvibNet#Y!yL{KJ9&<=q&+jI->t7i8R!&LRo_mX% zAHT$wgCaTc%R`{2i7Qsx-Ga=7610g?;aaO(Gi>$E!$!z!l`48{$m;z^-=@lr@wNi5n1QoO)8PoJBASezefGDMn;+frZz^g#^V*UH&`$(|3 zk&|>kO7RO_*4T5@Wk2j!^YQ!tV`q`JSP{CzP-TCT=dubq^&1?=UaFknyU}>V&@Ez^ zZwO5$t-k~q@a)UA^&{v`mJ#D~X$@E3Sv+LivG?GCd$PYkHm42PXQQg?Q+wHO#)EpG z%)8Cs;a>wq7r9mW6|i59I+O!@l?dDA2ed5vPkg$OrpmxB{1)yP9qaXBjb~Y%yFYuG zU&p;A#8H&7d8cAe zAaeP=AtAgu>wHwis7Tv74&_FlKWmR)+EaC7x>tFIt}FUWQGl^)kCc)wLk8>!(Lg#n zxj;ch#FMW#*}&|>&`UM1=t?49%-oS6A#Ssy(2 z@52vC^dLWUwd7v*Q zSK(60cesT0>kteAT=vraCWiNb8JU`QuR$|X4K0aRHOiPevOjSaw|{hj96mo)T@~om zcI?>hqX>EDa+e_HG2CbaT;u)_U*OQ!ny50!eHfusrV;Y>32Qvn2!1u>dD<%xm*~JT zSIzLTQ}bWyLg+CYvX(%cw61@`A31t-p>ZkoW??4(#d)XaP?VmBZMjJJ5cB|Wiik&P zR@Vv7WCH97VO&r1s`K;N$zGQ43-x>f?|X8s+CaTVRZ+jXgIm#FeX$fcJQzt+Y7^)E z2P-~227!x!Z{xaQY&u^^L+|M^M%1rZ=y5{XK%tiel@u<7ct$imnyoT*(?(7`cG!@g&dmys?q_QgXALyapo41FOAj1wiW z$SX3KxE~Vv=2-2_s0ZH+Rxb&4=S9}YyMOo&mUSWf?Ob$@DipZ~Q+zo$bM71i+uqlU zSSp*IZ$Q#C^-_FvP6()c2ykt5JuaZ`9jcTag)}aBmdPuBbaMbMMT$M!==|ebP($={fR!mVzO_cS~&0X^w-i3hGwdko{}0T zeins<+4 zYIkVGAsIoyp#lUy`^AhNv1jxc7GA_NmnG0u+BRQ%tGv@^_=(_07?o(nmmIzLR2!7U z{VuuvJ1nOfm0czdMqTSnd(9Zf^L^NAX_N49>{C1t`Rn9kieoSm0IaJya6!D}Eh5R>*G`w3IXApb9XUjGw z?ssjHvdO8NL#a*VK>y#Y^H~y%qvBhCeV?Kx!)+#GK_)zp*mS_BX>88Jcr2o9wf}zZ zL@n4ejPxIrD|g$OlPBB?GhARZ^kF#8SLcmzFFDB@d}H?)`ZH`ITtD%jlocsuMPE(wGlzz6m)bSr-$s+97B;q~8 zNJ|=A&n1F_=XC6KSq~WDbNmG7`g>5LT?O{i4SPnHF5h+q2Rhb`-Kqm98ZVT=aDpzU zuzeV>g5esz=fXqtX{Q@VqZ~L|PCdEc!5R+`>-CEW3T3S+j9fT!tfJ|u%DIq1CtZdY zs;kA(4bJ<^JrTuRqe<=ojpfZV5ymBMbH0~;E_8t=RSV|Bcg^`eHyW8}D)(7>M=4|~ z{8mrERmLUfB?*+0=ov%%NkRv;!P{Jejr59uxMzH6k93b;mExRn1?5ZsOexbV8uz_`*uN0;A750|VW@4n z`CFp5B&nAH6N6uE!%h0?VhE)ItO>0r?)8)3K4`x-mC^@S3H0D|_8f}cNrq-$ zPYlzS9Uab?x%Fgax^-4{P&RDS+qubYb=H+%0zGFxWx)1}A&*bal611k(9m!%-gXg-m z5}v)cItIh^u7R_{Wsxm=|Di{el^;=h>c02xQ3l%Btscey-XXY)%9@YeGPExg?M;k$ z_DmTj>ymnMH(B~FCn%QldrQGp*|$%$t%HEi$jkD{YidwuAh__rI_8_SZoq6PWzZ%M zhD|*6*x#Wkg5>%4y15T#C*Lo9yV1rA=hw?LYt#c5{{^#1?Z$V^dMD#xh!@L#YcFz} 
z1yd0A-Gu(s;@rx787EWK2cmoai>%DnGN}2g_XPxiI2avoeBQISp7l&`c zgrJnuU_Z;X?b`H!wMqxXnk@2|MQS+RgYBTPkw=5dk(?Y2^C|!EZ(+H-edBa;x#yvjhr<;Da;siygq`M=_?F> zBkeHZd6p`GWOc3-CV4Hf(t&lkJ^_#S+`&_sU+B8`uSCUa1E_wB!Gd|kyIZ!%u0@k~ z`ZB2C{tKP+pgD~K|L~^;Qi>3Q{Yi@yPzssO;+goUZAMO_c45S?vrk-xI|77 zQfQ>Z=!NqnhFs)L8H;XbQPTqBb#njvKE2_Sva**xt;Nb&SoUx_89z;WaILN_c|gg= zQBgj9C0vB-$tlr#OHAs)EF&8B@5QYWdbc`lpHKGZ$Wv^jboJ0(vMB)x2+f1R``bWI zA#atbYBJk{KOuBS{mIP|BkvP;dfglYFR}1ySJg7STpX%cck;|KPPWKZ{9q>-KGA&f zEyrEOH<_f|2t zMT{?;_@$rmoh2M}{dKqKv?$y=iz4x!%jma@2ZXM_g&8f4GobjWJqcou&o_QHLE?+} zky97Wrv(MjLLXUp_OmsxXsRkJ>HQbghvimo_$or-{^~_di0)V2Pi6MM7XF$#Of7=* z{xizfpzHbT0X0$LMssW(QgEnO!iDB#kx7hFK!uyvlfNW@h(shr zu4G-Cx5#g?<-qmiqo*N0HW**}I^h(L(wvPxf=J`Z%m?MSZC^n~{LQM*6_hfwa&;9H z&`#)S3e5+TdO9!;eewS;_dVW?GtlJM)KsT9Q|{AK*0t>r^uH`Cq=*yT=XxARu+rSy zSUk7xq48XRp~Q$<&NEaFyt-|i&b@xD|19zoireNS1EXUv(>VvUj2~YZ z|Niv2iP=Jbt=!Vv%NL)n*7{EUJl!i@6lM9yMm|02;^J64%jh%>Tp&OiGASXBthA!R zXZk7_N}e>`v^u%+Lh7V|g#dc-bW*uR5L@H0x(LP4-|Q=``il;{+ZZ_HxlRcgcjvt# z^%sYNHE!Y}HT~6TTd9+#1FA1^IaQzL*PC3etSt(qKgsy;J6p0YRTv#T?IzVco|wuu zzQSZ@Kd1a~);?JBFzs*Ng7Wp*RvpcG_$|frBmZC`%~nt=6ZdJeIbIM7c=Ej1Z}sza z=Vxht-=x?N8N@$%`g9*p>q{i`Huh4^=8rHNg?wiT(Q@2C3fP>Q$OwaqJ2{x#@x85+ zP^y`ggI$i`t7Ap?OVXY^%~h9)iaehY%ZPE_FJgGzQ04WRJC}>`q0CiB9e}0U5CwcD zox>0Ck~c}EH0wp0cB9zSUrpaO?sBU$UKp0ph;P52&d(fvNa`suq2D*e;QWz#OE=+U z`tGCl`O(CROv$Lua?51#@uxpdC>0CM8Ff%B2Pke7<&UlZw6D-Ah&;~m*MCjLx9p3L zWPUs(<$AFTD*6e!iv?M&3yunP|9IGT|P*5gn zlS!LTSe$1az!)fGarHPLxu<=dV2R11GSOFI7AC#k`9mSTV_ik^ zaI!CQs1hY%xXxC@T7F*M^;;(^!0)q&rlZ<$ZgEG?h2#hYX z-#*36qmt9(;&puJk8%E;7dqg{Ce&LIoKWP4&+?L69%YnU87O?(@Ev@;Bc~OfwT?F7 zJMg3?F1OotG?c4TdhA9YT6xz0ynSioj@g>W#_=A4LBH_Igt*V*_@4cd!#Hww^8t5^ zu;~Rg#nmINdq4J3oY0#=_4``DGO z(A~j*8{ob2exGL!hmZD;g1G%@=k4e)5p}QUKvj=*<)c`h{b{D$djb8cEqAsDl)!s% zg?r)m54%I}J%9Mk`4gSFYG>l2iVJUWawfF;bpQs1Fgg$q(PDnlV#>lPJ%~#T@lDsx zzIajUf^WV8x%C|sru3sNqHqa)=PAK77%+LFNFrJcmvP!arWDkQfB9(x`UGFaBqdl( zhrR$=>H369k2ES&DSXQH_z*aEwZc@VIw+YIgU?+eNn8`Z?xXrtJ2l2 zXY~4!FsqQSZCBCka+|`wzqhT>zT#Ekif5*b-M|Fjw2T@cMG9LDHvnSHM&JA(dTQ?* zDYg=SWR;T9wZpbUBV>@R;(?(%S?(l-0#bcsg%n-*qMf3Z@Bxs_58zS8U@%eA@;fJG zn5gomdaC)|?1xsQ4RK28({c>U{?*lxnUU`=iWAoGMCV4_d`>;@3z8GG;K!h2lR{~1 z&*#Aj&#yD3$OBL&=Wy8H-l`N`!+%T~j&2co>^*{G7BClTMb#o)!N2bz{(G(Z)yS)C zCmcj)*uITG!RwG!KB*Rcu~)siUZ;w9fA`<)ZlYtm-m zmVkFtPRlPc3ep7-t{2BCm%Xstb{x{cXoVodU9!Iuu3kBxF_AOX854V%QcA-e1a0Z+ z8P`)~9oN}=S33_ITz~YYFi}T=M*CZL-gMVRp}VDWN}YL9TS(NWhO8!KTjl!R*#kJb z^wlC|ntw9jj-3Z6jLOtb=-Y!GPbt+O`1$xHk`@atal@Y+a&mG$+=BVNLvzt^PDb!# zQ6^m;M$2TakWIh5Y&ft7HESO>``t?h3O*=&XI~UjY_iM~p)C=rQ+pZHS>kSI?tS@+ z^6aD&9CUPqNH&?p>us+}@1(=M#{YD`l^IfHyehi`~TGlth;0hexnxc>Or_{069W6!M57aYkAEeK`2 zbmls@3eCa7)HABbRV-t;7hZ}y`dT)%iJnegSz@;H>NLdUc1rklx)(PW`y}9b716#K zo~bMA8nf3NsN>s+ zgRK*XYqN*Il7|PmF~ZkXo)?9}zndzIbSPDtS(Lc*6Jbyu5PUr$ohl=4cf6_ZRDO1! 
zV`)bPL%5&&dYr?pGS&6z8$-oCp1+8>3}R*C_BpleQx#GZCILS#ay$|#?EjpE$wtW@4J9h3;U}yE3K3y~p-#>ePuQIpNRb8z?{22A_ z_ytdaPpn>H6US7d=9GNRS_g1}#vYtEUfx0HXb{6Q3Ygkhyy@5avPh>cul3O3ax}#L zGODSS8ASNm;|Bt%EmF!zh8cEqNo;E5tJjk%7j9;4Nc8(}MBv5=$QbKsAD#@0D0~n` zsk9&{BCBj_r6~%^Hy&TF$6o^Zvu?1&va%ak5%~^DG4KWq&yG|_ODRg(k45qMpY1>6 zI5V4-lJM7=J49k8mPNz9U^H8^cdpKILAS|!ht81fj|i8&J9qv5jLK{~jpKP(b|q1} zd3p$KeeBB8T`+QGp>LOwuj~>SQl&Qr`lU4jM6OrwRiOy{^$k z==tkUYbq?Y8%}Zc6y6Wz;UBTOJn1Jt-!bupey$?NLQQE$$&IfCgKtXc1f}In=gNJ* zP;-R%+`4e;MTAoLkv`7vSSR$R>cE@#*SvL64T-_y?O7&FrJLWa-F~%wRLc!p&Ook6 zr>Xd{@C0nknHBU5`rh&`#q*m`2KHrZR<0z?*iLnOtdoR#k@9nn9$JM+DhsP6l8BD~ zXXZSQeO_TpHrGcac{5q-l@`8BRCiS@=NAOX)N#G(kO>V?d+b`nI)gzj=NfYSax`h; zO^^KfMaUo_oZmM%q1bfwn_krwnE}bkd0iIAOJ@q~wu5%}XEs)^=H||?@QT0cC=VG7 z$u=(B7|)@OU(h27(Zq8-y4EE-8f6xrtaK(_|AiAth1iBH%{Q*}!S$wOp@sTAx+{|E zZ1;Oj-=4m~!pR4b_Q1?Ip6ommcVrK>YQ@tE?41I#E-~O{fXHQ~w#yH0`i?f#SI<;z znkLCzs26QKeORIIJ_D0UpD^R#$DyUVCtcDE2)zdHk1>_5eSBPwzxNhA9p{m^jk(9p z-O9T+-Kk&2Gb+@? zF0XLAQDSLeV!N9(=OHwiX~wlDopd zv){>Yge$+Lbut$-xI9k@*z88pQppX3`k@nHiC7h8+4sv@3TRCTT86!~-jHPty1MnT zxy^E?YA}y9O`4iDCu0wm`pEc5(aYvfco{jBX;PIsr(5DrDKhd;TY$nZZnk~87#GN8 zn^@>`q?7+kN&2oADl+X)Ov|Uc<2H~7>H5;syk(Z6Vf1rqy&e{KF#A=UhnMDvYefhC zs*#&~oK>lRfRwzacfV7;Q_CJ14}y%iz3vS5Xony{*)9cz_u-ka7-;n>I-iRzy@ae! zKHwFG)iSRq&iK#xx@)cNRpY+ud#fQt<&90=g2~g7dTfOJRNl3LzTxhX`fwIlDc3$M z^>fntEqQ{2+fE=OaLjj(?B9{Q?(+qWa86oF~(oCLHtRUH_W2gq3{W$ zyt#lN9`aXmAC}}X6^_Lx1dwVh3QJ}i1QXJmxa6Od&sH&95z|_uncjt&IW0Z$xEO}w zLnk)4AEXGzdzp*0L@Y+8RJt0unylOuL*&9#RuJb2Z|wSD)JxxXcLSREo7Co{1toHB zr-&b~pWEv6R{tV2DG>8luJg=>@&ES61FEt^PIN(=sM+)EXdo zBQV{$K`i2oC+BbD|2ctrV;ZcLX}=Kr zuMHJXC5`U+g8_D+e1a@jCLTVILd@wc5+Hzc%Q6&EY=iBM!8e#4Xh-fZq`*|CwjG-sDT3 z2R$u?z@+BrN%gcDf zn3F1X%FQQU(nja8S*lN5+eq}B#QH-vO<8Z-S+Zba0Azw zK+Zjm%jy!d++=@=&ch_$w3Vj`NptIjf#|B{q9jw9TS;VLj_}J}dHN z49?Fb3eI6`M^-W;vN-pIqwCB6MiuiX56{M8O7ssvvNoinMKB-=<&Gf-6Yo{J2Hib4 zsQajLAtbkny^}OA-<&l>-Qbxyu*sms)q;jy4MTJ1si zfZVjzD+5a#o9Mdrw7=FJ=-?7O8zPgR#}t^5!My~U2lMaqV-LiLl%7~>kZb9^GF04A zEjo-R0VLv8T<^h1ZE3>HMCR9F+TQe?`OcCGsjJp!599-1=IJ9%Hg8 zynD0JNIgfXhY4)k_;>M=@=&qX?1$rkc%pXg!tBBr^ByUb*$2l8w&&ydA#c(FJ*HDL zzShz171V(T-kJ<~cTfpFyx5sazK+ZLq$lRZU7yL(pHW!%oA^2!j&(NWJRLjWw zQqjoR3Q1qkY2ANwCV-((>+A_cfE7XMJZHB)fS7#Evm#{yJ>fD^mnLq`7)jA1^qRas zCRVz(vW?39eJbd}sZh9YenE(KY})9NP~R~MO3ORNTCM$0?a|43Hke1$f~H;AWR7Lr z!dk#1J>*=Ihe-~VHDhm+BoqgQwf;5CDRE`JbZm?~rbFIE9*hR|;0bVru!<#bVsQ{> zm!kcbd-{ZKz|L@o_pYFhx6odVb#11@Zv2voiBBRH?|<+Z zN>p89cfF!CClWc4no?E1GRTEoN;8#WlB?gW9WvS2u)z!0-Mh8X|CGhvdhuz3mb1fY zM%nBFqPHhFCwFv8LD7G8XB?+uoadl-wG(n7#;m%4>qNy|9alA2>ko)OvlMA$Ull$cl7jk&eTm@_PR2;gNtb-HdwU8+95P& zWZKws;D+z5^Ne?7*2;fqc1@!A)VB~o9?&oD_fmC8 zHa@Gn$r&J&oeycmy9aOik&~uV{;~-sHJKqM{EG7i5P<56h-{E{|B%e28&0 zls|UN)VUu#^G;v2B1jIJUhH*xzM>=F^&ZE8Z=Bz}`Euy8u6;x5PR=2X4Q8;M0US_kX9N~dpdC%h&wx9&>Ip48%&R08EZ{u<( zxkiD-kLK&Vr%@+9sbif4)AgwxN{ag}$Ly+^WYQICtRaBx{%^KYv5DZI{*wqCmYC4N zK$l_y^PW{BdPdyS2#<~IL;|%ZA>;J8&vW2)3b5%^n57aECAY!XI8xgor6>6jp5@KB z&|R1tuDrGav$2ajgnu>(KWq}gt1edzfItzoyE7F`sbxwIco8xRZI2X;gvmR$Yuk9J zrx!MJzI=>JKO4|GeScuub8Pc;J6maO-H^6?Vm<2g?)V_RB$FKux69472?SHJPv4#I z7DKX5>!G+3&dWS&PaSi7@osHbx(ZfXxO|}ga;?k0#n@XL_Hs&*@yNJL7q%(oCZTrOjUcrRN4!jSs{$@jNgD+OFrLMk_6XC6>&eh}ZPn6QU_M{+{*0 z^-D?|pBXPRPbL|oIa|UL`x6H`EuM)Yqii}i%Z&@WPfICo99KMiBm1uh40YMlr&Q)82^}?r-z^DqY#p zrE2kM#7%LJRg=44sM&ZrgdtH36UD=3{P6NPI3)hc3-b_rilrL%%p>mj@E$!v!7|G& z)}*|O>cThJe+o7^!12@l&2bbQpR$GHl`y8MctD}1-f65}CO(9$M|Ua;T%*l?y{2Jc zEfwx!C|Tx@?mcX8+*|3&wLYabv`TJPIEJ{pc8RB4RxQwb{g794#1{Gp3wssw|3V+v z4U|3cLVDOiO2zm#vJ&W1o=)Hlu0}0Quw+>14}*6PlM=8Es_u#o`OBI}-UrQCVx5fb 
z&JQg9V=8fsS4-7~yw4glo&k3(MQ*Pll>3!FCMRYd{%{l?F|`6&E45fFTHGw!-;hOQ z?9Py^Mfyfts=2kW$)OZaD7y6q| z5h`yRX_O*Q7ja-^R7_8w;8nrcpj*?`HYT5tYcPcoEUb?Snm~$m9iMvH8&=CKKdHUW z!&}`=DLgV%7Nxq3By1mhAhX^tk%Yb>`{p!V91bDNy~w}H6%A+%OPLV$xvVOfITX`e zZg$Mffu-gfeqS4}s`1?LbrSTy1G{FS9uq}0p3H00)F_F)(3QNYa5y-~I;8)vB${xM zyq5F%Shlo`0>gLZrH}tm63TCt;9%Z#mN~Vxty#U;y{4BLi_s`Ry`(VJ1G_M%!UmLt zE9QP3Vf0>G|4*TI>S?8`v{jx-%7o9(8RYJG`Kej%Cy`w}nU|hqLobsC5s9}MeyF3tG6=_hwkQgT@Cxy1>iaJ(hcaK25BJePhEPh59e?y=H3 zvy^S;vp3qlK5Nj&{OsiCMo|X9%PxKE8a~xqXKHp0_@2ctzWPEKu4Y<>20=D>GPcWU z%h2BvfG|L{(KAtqmwXyAzvbKnrZz1le}|?}-gkQg3IOVNR=tp5@zzHkd-gl$^wA{M zGT9K9hBsqY*7hHn7I0dWn)-imTB-E^05O*8fPa<$ zt=VM1rX)>9hV;*M+2l#M*N484r@-k~Z5qc8QeT1-b3p&PJWo^kGzP=qDzeJ=U_~=V zP72{MPK}%9@ z1MK(p^d4>*U{tb)!Eth*8@KpAFJAwn1ZpRmOgX6d?E>1d4X3hD__E0ra5iFzaV)WU z%p%6=<<2I~XOM&hYDUi7vx8GIgafzfQQ^Td_Hmh^{4c>adR0S6{ZDw~G+A#(4 zjd&UCjxWt{V%#0dBQITZm#nOvd!u|}6S1c>O!il_;p5yJR!!X}7i!qFxI2!N#Z@}W z+lAnjNQxl?!F>wkxfN4J*?P^Y0mRQF4pf83Mm#h@3|Da7$Nh%#fpqO`UiDjO>SA{4 z>#BQZ!icEHBw+!KJ(@dns^M3-}FD@mY`Lm3!I%d5wBap`3eHf;}o!nEJW+ z`1hjwpt>=^9phX5I)t|?h#+rm^yi$$`t&Ql~~7eAlrq1xuSM& z2kPZ+3Nw36PCf7-g%3bWTwShQh}QBGNYCp+nf>w}RAMA=*ZoQKpIhT;0&At$s%GtG zHe89-yz*8iWj-i8UOu+egwu1P&*>3OfVUyB@wF0HzsBHljQdcr$tVh7`2_F52NmhV(6Y3=^WZQ5T|iXo$ClZ zIVyx2#z3w!>`|Jnq^h=jdL(DU97cD)WPcM%YcE06DG|fTtICxh`|^ttluf8LWk+AK$`hIVYu{dGREQcs zprSli&R2ag0LUU+7L_7kQE3X#&dRb zl_^$XJ}Vi>W08F~DO8$1Sorf_T1rvoZ%<^N^k5C)xgCT=*16X%)#lf>SZ9?@6vO5S z&c_frZ-58(yuc7P>UgI<*6sdc+1!|4-fk6%u$Ui z%#BtGJBW^{S(pQkTDt#kHB)tSBa|b$>x!S7`DZMM7Lbd*B^c)`U7!jFb4g>S=T+ zMIH{St&B}Js1(8~RnRMpn**HKzPW^>$Yqw;*ThXx5?cTbV48ty-p+mKtQt%gas4b3 z9gL;31S@3zm&Ikj01MGkTG4=psWV-lAB|L)Evs5tW(s!td@v$C$Cq8Ufy=;+vNx zm`IZyR`K*QQ>8bFU2QX@@HK<-3F>(HIN$&9yix0&r^t+yzOC18hK+S+{+l zT)znr!L|!+TM7`t_CTR2B~6cAKkOmW4=A8IJYZwQOeMW=&?kR7Ij1j|gHp3^w=3Wu zEya~k8nBylOy6`YzSDj9^lpE!01i568J)QBW#H0wC#HpMCkh`g9`g{kSSkaB>q|j? zWF%%%P><&M*H9qI&DPPEE%)bAXgDUZ{(zZ}_k~NQ192nRKAG>5>)a2NV&;K)ZcSBU z4%bAWr*_rNy~4WBz}n3)%s}tkYlJU=h3vD|Fk9m!%77VIt2Z0zU_fYX5!+fw&iIX{ zI}S~oIZo5OB#TCeDmN99#!nyI`0oXLZc22~_ee3W3d#L0H@5mqL+qhE(n~*jBZaH) zOeROXOxz;YQ*K_EGJRan(85UP9 z3L7nZx`eQ7bAv+7Y-btDKSo)L#@+>2L3=~iUZPPr_wreY-D+@nG!47b`@4Y&K>sH| z#^YkyQ+p3<8^Su?-*ZjF_oznRX-X$6_d5!&6`~{>VldKnyU4+{ywD&+KC)qh9_SSv`a~1L5}cmMSaKXH~(I!-L(7l-u^(l&Vc5~3@l|xaivAc`7XbM@vS*5(1BA$Q~M5U zAA$v9tqSCFd^2CC&SW{Q^(MJl?xu-HEp6gq7BWDp zB;0879`d6{2$H_uJ+i(1lxct`@VxiyRTYIyi`FqXLHWtg%R`^iGU+K~SCG4R7@4ab z_*K4F#vVPf8J#Zw-nSI|xZ4qA4zrKM{Ei?1S3O`b0X<6KI3yye$(@DsxoBTWIS^aT$7VM=jN&TsO#{bO@xZ4r@a=>l)J43K&-s%yA414 z>x%M_?89t$ACKtR1VYY-l9OwpUec+S)^^_wqJ(7enX-kj-n*0W8-{VLmL|7%$LFS9 zc!R=o&KlbWXH*|0*t1&31)$6ZUfan>ON(qwh25+cp_ws|S(JB)%Hf&`lJ4feEVId2 zja=+hGNorB&5Yx43m8Y*1yZ;u2OKUYW`isNZtZgM=!Js23n&XhW+KQ#GL`nY%&a!B zaCxMJ;pV~}$xY&oG{5)?3JOiVW&7bizbZC#eD~cZ?;g1F1+zDC8d)mNd(VO0%kiue zZDJSxlU}nkhCtdV6YqLXo*V3mk%sBxYAIZ&%1Xo5d!UZ*t8_W}X9*9dz+S-TiNETI z1l@=6T=DCzHGN0{!W3q>x&%S#f^8q)yN~?$2vdH`gu3)X;88|FuMh&T-2+b%NCG(M z$S^gEq5Zgy6~ElhL~pT_;yv@;D(sAw*Kl~z;wT+D$XtXIU*bG^PsX}@I~mtdlGeHl z#&`lK4H~tB!o@$I0Uihg(Lrp~EI-VgvKvCQ@+9pg^ z_0bC&nrP{%0p>17&Ak9p5egk;wddfUIFrW-E6-@0R8D2?kw_9 ziqg$>E7Ad7I*DfK^^Z&F!FMWdZIugXwU~M6IXDGW$RgYaoT^Qo$_hFgWIcpUyrwXZ zIP>Bi193a2Y_4JHw2T)BhWKI2tg^{My>tDlJVi-YOY2mOZ&E{dm)k+neeseRiKuvJ z{>^f;eE!PM#&Gm{4|%y!EJ#f-o{zM&$R>hau`vbsUhr{lJIj=Fz}dL+%ksMkY>Uaz zrmcf-l%0}W?FM4iNKS7k&E0P2dLDOgQP0(>ufQkkO!c>gv;KyY4 zvuk5X)fCrT+wcU^VoG(W%wTk1IRm7pH&8#cTS16zj8v3X#nto1pPfWx2O6GOzBgw! 
zeLRYn$H&n%(TPlm6d$)zq6;_nI9DxC5r$~K)>*Gki#Ye|LR)=iXCZc<yPrO*T~MlNrISGcQ2XqLPFfUJ}7zL)$PNxkY57D0#+MwluX3!-z3$=XqDjaq3<_WD-r$U z+23cj0VH~>R6BcE=G%gYXSvUNO&8;Lr``8VZdK9XN-L%Hp1bf? z!+V=|(r~j5w~^-8_IWjDo&m~$0(fWJJJ8H4@otyjRXUZ&)!nsatc5B}C{+Hh!oyGF z;siH01aYZpS@u46$}3$NcaWFr2vx&ePP~@U-oI*Uqt}-EaId+tm^^u8Dl*FPW@ug4;I}mpMJcjDHTIC+E#NSJ^p@Y@b)UmKCWtFJu+j z>U(2Vf@DL}r(VjKu)NSzID+tL@V?X--|M~$lX~^g_ENRzK{j9i<~08j_;<~N?RP0l zqHTccMon~EgfjsoUJ<7__UsJzAf+a)*lS6hmg#AmO8`(ig;@o#k9_=5l&yf%|K3!-9rJkP`{B4OQSv_McXG_3X znMCzw%4S}2KraU^B1Oa&@n?rBlYs}CPAGrf24WCd1&w1{@r*hl4h3^Iv@-%J%3#(D z970wP{x0YRdj+6i+_D0dnz}mdQg-DTj@p_4Qsr9 z+{pyKcY}D{a6%soN1OqXoMMt>6u_UmQ0uv;OiruQaeMYVn@xgW?s6F*??Ax>zcXFz zyD&YGDNsk{+2G=2u#DHR$Wvv@8?LxfHC}p3DKXogk8ZI45u90SMjSFEJn~lB_%T}{ zi$@j`=Fth)50;WB0-D*0Tul3}E+|iwuC{pJ%E^#BP=Cvk(O7J)IJ9b#LF7#LoNV)Y zXQSAoDBd*&G%m+=nYON{NAF+nuH6iC{cu#paZg9w~c=wFoO!HSlj9sivfN82^pdwoakQ{Y3!z?FQn46qP3_Efb19IC>h(bY6(gqHX(Q z-iJDsqwL-*t$WIM;E)M=iG(gKGlWn5$Lj2_M}r?y#2c5vgF8~;gmnDnAqtHMV*NDV zxkx>egNo3v7v%b=rALQYp{uyx1}*XIv4Vb2Sd)sV%3a%QnhM2HtWe4qA7Lntep6yQ zcSY`d%rY9vf`jSL;h_qO#xyV~Oit*#93egObGE^sp#QU7TGWStyJjw({H$>nG`}=r zp-j(*(P|hF;$=9lY%%_n69mN#9|vD__7%JzlmNgOmAlQd_}JOVEWc#YF7TMCMy^lc z#rImDU{#p|GEeP1BU88^MkSKEBc-T?yliSW@Fd)MOvbEVUkZKhwlFi zLQ1*z^YuTgs(c;}0JihR!g{bj@SnKZRkq`KD|G$@$9CeOyaRRMEsef??%lt0*8x^| znyWu4`d>4k;6B}4g8=}Mjk)ju##8Cp=c&U2MGB5(S2N2$gbqUaw8&|GzUujlUUUeK zMD{~fyx?cofuVV?liBzH(5?5!`%?d+U;fu;Le0agjzZUY z{B=)R79y9>?hdUsCW_=};3;JT;y@zAMA>n&S`Ur$MZ}RCL{PoaNS4jO$}^;;e^&1s zSsvMg!XrQGxoV>Dww(i+>!%LS_L~OeZg0KM`v8udG8nNu0)fl^sgD1DeUyK%lPX{e zc{W%;3Q^`7SE4Zg^%(A2!rJv{gtyDUC(*LmDjaAKXervM(`k+!sdN*T1q{r<8j=S^gucL}BMz@@OV?+|{`U1f=aI)?}WV4K!N;D zp4+)*Z+-xP-26a0Blubka;Ahv1<9K)D_wEwy4}V2M+zH(2Q4o=>i`<+4QPOrx2@7Ab8?h4q{Jt2&wgdC&g^3;)^@+Lz$dslOWdK*xH5(p9~GR5M!q$Rw-} zT_&U1cybgoms6Q@w{=peEe;R#h|&*#(Qi@8lY!H(D3^#7M+ z@!us?-UrXhubv701!WQ}BwOcCJ@nd*ZpYH8D)1A2JsajJ<_jup4!AOejor{Xks>23 zaL)?%6`O_n^by#c1_9lYGu7Y;1=4J=>BhUM^?A1km9Fuh%tYYN|3!I3W38m^B4Yjo z9GJXm=2uGO<*EM9kC#Q+TOy8%@B!aL((Ml%nJUDZFet#q9-9vRqpd=;A)q60=9oqJ z!r@XZ*GV~Crs0E>F1X74d4X?Fz^ZU0>vZU+OgDFY%r*W$ZAerZh(YlgM>`?&qHe=} zYKYyF^#(2g2PXdf+2O~>s?emREa@2!@1ETiKwmuswe7lDs8r{P+Roka8dp_gV0S6Q z{$`1mNE zqM`ISd%y3ga5kJx4=E*SI-0b0aj@=VJ_91^Q>--QsmgTf-*x`#z20e_ z=icx8xtC`=!GVPEHeUDvLojR1UF5=MY}J0xToCZKW}y8Os)&;9e6h0sku~)nxPqXZAj% zAF*ZSn2YHc3<8wl{)!XBHy>T2iC}Ti7B3Ym?S8Q33s4tq+r=?v=BGRMqwa^jsv7E> z?E1KP+Emcz4zL>3YiKh@$Kh)Y;cNY^cZt&c`wH4fH9$5bA~nvLcNN6m5jrA4uVwI} zGMp-IF2E33P@89w!E%&_T!4|5*o|jCwO<{%Fzd}h8QvL~`#B+*ADQCB>N%VM`(8(n zz~5JzJWs_REstUGI=&wTzIUt3NwRJ916QXvS{d| z%diO^%92=KO9(8m0lcqFV7)}Klw%Lff&>C@PH)g_ilupA54;2)9*trA$E+EM4N9^M z+ZkjR=(n34C2qkNFIexv#1XJMJbWyIb)@Rcs^#`YY*U&>2zsyysHg76ODE$68nt9V z<`L%wGtsRXClg$Yd}U;$nP0zF*uZArl^#Gf;{q1TvvX26nTwb2v$NCiQoZbTYsOrk z=?!x^gNY7iEsa~>A0&Up_p zZq8Uy`M!{$1yce_y-G!gOW#|l)suLJlw#(bK7qwFZ#&_jw~qrMVjIP@U@e0|f$LTy zAwq|Z{>CBadejOFXEX`Xwl$u@2L*ylv9W`s36Y5%sL^9idM>DO9((zRIlpC3fRq}v z2mIP+@yc`ga(jkAOruN$+l}j_`9fIxUNRa02-%LKPSXL1TYr!!@u;Hgz6n5Oa7YY( znM!sSP7og242Rh*0l$YzCak`G8>2;%g_HS$B`MTVhpAY}4$B`-Jl9E9nU>cJIZs*^ zqk}z%9u$v8gmi!i#e}?hb={3C>K%8JSw|uwyrDp#Hi7T*Cw4f#&_ezf%jW?pe)Onw zO(Gu&xbg7c-WIW6!!5bOuMbj;rEhT$D^!nw?xf?{op{S3ys6lA(1IaUDRa@luHFMJ z7$2=Zj6BduWiOlUqBO65fksk2&@B*Y&dCQujW=;_V%;g`CV3heWITth5h2sZNg_3| z?H6N za1?(50M!#F`x!S@e3U%RRo#2D;tc{$AGm=3kt{;=k#QdoCNm{hr5P)P_FBKg(U0ST z4PKZ>;xxy(2eiD+2b3};>xv1$pm2D=VwHlR+oqGNBmpjFdl81+b-C-2+2_3{4B{?$ zOyHCMxo|U|9B2w*aB`LklbosbmAYT;;tf=jwhaAEbL;Qe2VAt-F);7XW-Lu*?6qK~ 
z;||U#Fh1y(z!xZf23lzwcZd7cQX->2C3iLFg?qYl>z=SBZiN2HCxTm9aa8=#_8@()EiRo1TrEg_^j{Wjc^mMuHrPouhq<$!cbt*DwAcEF*-jUn7L`gUT7T7V_@q* zZgcE4r$u4$264P1Q~XQgMoC>2I|#Hl8{yx?0>ri7ByE6A1J@>hRr$ zTb;R(=5jcDz))|Sk;{8=__wg;?8u}ijty{#Hs2fGtWYSwMGBOhT-rKW*9~MjCJFfI@=OKsRo@ zK`(!ouC>xD*z@JbXZLk}%r1e6!ys$}UOya8In8+J zNasO@704aCr!Qe|_b%rl(+e;W+Zvi*J0x3o`l}tR%GymU$gYrT!9}@w>ROIba+=5$ zZZXkioDzLjOVN4{E(jS9=Ve-S1icTzSQwxcp*8(&-4HVU$SRo*+qeiJ*yTeO;@dzP zUK=1~B!NOzOGE@)H^5qM(qg%~ADyEPE2j)36e)Gn^7K|FtvHJV=OeG|nX2N>jWkbu z!z4%RfnAMl^{)G7gIwYMvHejb)rD|)d-#2?byj$2@ z^C?m{7!3dEF#;Q!Oa);UX5T#{X9qnwEEhHgHlgl8tC|X46oy5GOo>@p5TrCfz*ICP zQb<92J7_#ce(^!Dl(Bbuu+^{i8nSz9aJPnQ|9w@%wu#cdMGqTPOE<|__z{o4oSZuJ{R2e%M z)tNee1URdw8FlK%PywxlIE2_5BYw9X9VTvO7t^Bt!y$;G0p(Gu>m371mt?UALi*FT zWbyGb71utv>fV~KSPLf46xlhOJ1!-n>cfh3?XrQG2>kn!CFn&iVD>*&vK6EU=`Hj| z;_Cx}Q<}oV3}*eENaWbuNh$re)|W?N*(UrM)(>A}sUsEs>h>vnClRF)E%<*}*=U?&V4%mS`o5H9Aob9Zf@8j3}y zw_0RL7>ognT8tkjhVubg=#GCN)ABEEFqlHe$J?;Mx`#JPBmpkw4bKf2-2X7RGv0{n zP1@*Dac#EKZU-RX;_u$GVvX0_O@XEy>QMRvW$`-l?h9i|}k5@$b6!?Xltd|EbRZN}fW&9Ub#I0&1< z+*~@o5wPkrC`<|Mtbuge*)>=tp3&kht{)~AHkGnkoul-5%n|~3#{6!WbMb-%ADaRCW@zn z_yJ%+rRmxV8x#y4K7K9cyrc)tMzU|E&K{tF8=cQW4_C?VBh}hs6sw7sUm>}VW{Uks zF)2Z!%0})eR6L(MnKPu5$ZK_)Dg7Mk{J7@H||rv{ecopHe9m&{ReJ$eZR z{a?eidMwZdAH!*{{Ex>Y9^3L~72k0!V}AHFL! zjVAj7w~Xok7;=6ruz!HVtA{SaX_yL+jsUH0<)hjTcv4&Junw@F!_t<}Tv_Afc~1%0 zADu0!@xKH)Y6I2A&S5?1{xC;>wUAbYN25a#>HFw^h-%`LT`h>(e?clAnNQ{l zSFGPXT^ii7%2U9!T1VZQPQk4j1N;~PJ}m@*U(Y%%Nsq*&;;e!~d0QLwMPQx_=4C}* zE>P--1Kn#5X+Zsq&CVX+=TQI50&1U?UShfibD-<=Cj77r&jAK=PQrJO8S@W-EY0Dj zekaRbkmZ&66mH#a53wr$_ONdp$fD*R1ABM|)eW1(LwN&uTY*E{KfvMo)}B8JrskBj zPC-zumu_6xP1-_{Tprwm0GBB?!H0QY1C3*6)Si^hILE`)mc$xWXa_scwpYh{Ps5!4{d&b&%9&4>Q`zo}ipG zni8;}p8;Ox`(OP_EahWmabpm-HR@vNx3i@4cnEkLX30&^;_yFVt7~VEcCA}3RHwY3 zpzn?r;&2v@H$Z4N5q{8adL&LF4O3kV4w*6Pnh_u9I;d7p`PjJL^^YHmyFbrfdT)FH zJ&tAt&#!)%)6VSF)XDBsH)A_A!61JSy3d^vuQSxSdj1gdWp{4QmWr7SWs(Rw=Aat? 
zjD-vC+gdv(y652Lp?_L-S>bnRLN`{rQm^T)Z>YIWy@*Q=QqUq4lw0_WgWQx$=Upfn z!GWlT&)?6ao=zWt?Yi44GQss)p~3~`?Qb@VSY1sSH|k02a3qidCCUgV-3vUx;j*d+ z_b>>Pg6UtywrFaP!XUfHu=%n_kMM+a8={&^s>ajE1d;q7UENIH{FqUtSvx7BjXRKC z_ncWAQA+_{2}PfmUd9{Dc{_5!4XJdgnYZd-Zg-9OSR5S@ggTpG#pDXX|Y<5XdRzK>2>{xzN}uI2EeV%R!4M@AY< z)1 z%svh*TSxas9= zoit1E{e$;99{}eL!UfphTE3F?NPZXYfONZnYqld)))k@$u6_SwsyrbsvyM`6OG3Z> zgowVkMQN)B@`P6>Ucn4!-7u9P;ZUP*x4E-07r#2yafdX8QAnjUh3i5kd{^8igZw*g zZM6P`w%GiYDY&sM7O<>-zS~;2PZ;&!-%sIWe8wFdju(9{rh#*@Huk45RKV>cL?hZQ z94KwJ_Q|Eg5h*7BD_mr|v*$!)5c?ZC~Pt*pCPBxva5 zGW|E0K^ZZRW{7;!Dhs&t%)G)A3~tjO0!qTiF&thfjyp4EXt`)dqpu@`;R{DVA zb>BY=x1nD+d@%+J6fL=2pK_eAR_3&ZdszT$*UEZCD<%_#Fd2YMfi@TUBuQH(*R(_XR)-}P zi_}zmmL0zMr|zE|{uUC`erI$Tn`+$8Emqr{M&osP%khIrHjZs9FEb-x zp5*^r-NUIAS-tc86P&CB!&1@fk>V7CWGi0`Srxb%aKT#s5uo+$!LVUG!D~uFVS((wVF=19 z=Mxgqlrm1wIyaP|25${(_M7+N3DK2|6gMKaeDaE-ulXoQ@HT*Dl~W2Yu?H&T(B%cb z8!C>so)9_u1)Bd&!s+0m z4YE8tmmvKjPq~YnOv~Yns%ElgmjZ3q5Z{EmOu1RId^9MZvB|i*trOj}5`Jp(;;&GR zesp7N=q`Udh1ynHemHoyI3j#KBa*jtOk%4=vdS>833awEK)QayWW9>Eae#0x^l!1^ zYEk*q!N{<#r?f%e%Qu-rH9KD6v~hraK~Tc*Kb@GSi8``gU+`KZN8+Id{C}2r26w=a z*~k>kxCbr|p?iVoims8i7z~&pm^NV(Ck+v$tGU8$@wstQKNEjKt=FjbP{u~2sOF== z0NI06Z_CuRXiRbA`pjpg7Y@g1n{voCJrC8qK}qXYVG;Uno*kJfy=#U2o`y4ySXsz{ z>AAp=t^i`o|A`+F$ytffnIe~K+tgep4aY5EYm$cIPKdt){V_gzxc47ae?!Q z^fv$Fo|R3}k)w2KrQ?~nDU%kdYi5`(R_34^_9XG7XK zv~%{)op*37t9QP+Y-Kx%njxk%A6MB#-~VN2^*zbm*1lHs8Tm11{KJnW#~3~)9&*Yt z9AQ}LRd50}fvP;cyrNn&N{IUNtz06rkM9=h-j&8ocW8E3`u5Mhksa*sALA~2QB&}} z*u;z49^4K10m>8%-8}MsHQ}yCS-z^r-|c@Xv)1eq-(K6gT5w0Sb&{Uam3=@`r1y&y z{i%{_1}XBWmZBZLyIY~X6g+X2^LCh0>jHT4^7|rgM~l)r7+$@ACSWPFwL{VNu>LtY zY~!~B(_=rJ-#WR3S)}R7sG7Fis%EF*J;v{IrzMIr_qDNWb8vBs)ElGk$o$Y~dCInx!VP%bmo7hmzHK!t^7j;+!A1U87a$J1=S7!MXThbX>n$+Ma>y#T)xl`|98?AWEYZ~&YEcw89M zR?EaUIbSDzNQ?nDPhoYWFmNz*^*%BkGn_@c#fulQkHr}-=jv8E4rjXX8WBmPUb>JE zr$Y6CwS$K6uCQD$7(UDp<@12k_Z5ctY#hvT8=d>JtE~z_YFe@1{!Q8_sS_Reu0Gav zxiB5V$)WL%z42*UCVP^rD&OA6kJI1(@ZrM=vyrRI)bXqLx&@avZp^IQ@OCjccs5yv zEqezgZNr{PbHpA;S#f*r`|tums_+T5Gm1Ie@I&%hz&(l@tFPjO;UgGkrqoe0%0~D8 z=g*%{+us;9Yf6Iau*=EiIXY|5=%xY-ZJl$>K)lwvz6!Arm*}??h)dbb1g%(t@U!=! z{w9I%xU?^`M0A4SPf835fNi|pJlUjOK^EYlB9TXI_d&7p*&Aoc{3SB!- zXCo^;c0~%(NOw@_GTatVK~jwl?^D^HP1LMVeY#E>f*jPAqMhk`Ghx1tJEi5bC&*9D zdT(Q$;Ce%|%<%p3cW5}729+CjZbAUFgTf~t&c13@{JO!1 z^R#u+bA#5$HGzW5K*8TPG>$WTm?O%MFnXe&d}A5f;C8~(s2+FMW2Au7f)z^|7Wt&- z04gCiE{9PAAcb9`9aNF8COy8cQZj=5MSkB3&kf(cxQ{S2d~I;c?y}PA;9a@i$o^4r zAtj`v_V&ZK{kvQ%qKZbXMf@?k6?$LX%wt zQ-wmITxNzjX4<78%|< zmbmq&0!dQmG8E_Rk5EmO;?x`YfuVACsu$zqjnjiiPB}7qV>dV0DG#d%^Q<@nZUFAR zjIkPxWTsf*8Mx!*?_|N|e$LMN#LQ4N5>W9^gE%wiZi55lqmJP^!0HdB?Y&m6T@hZs#IV2&~~ynHevXq@+r{2{KX+$xgrAoKjiycR{B*4oO#7YVia=N%ZR7^BZv& z4Bx(feZt$@JI114%2M2uW$BMLJ#9)?$z<&j8(j_dNe2jw+A~)@w5^L@H~bLQjI4f? 
z-5k)8;vTJsC8q3brQwE#Q*GClr+)|k?z;$V-Wn;5)+sX_aPc8-i1P=BXP4?#58#s( zN#4RTBkl#S44o@RTrxpFA=r*iC-W#aT|4FTv)|{(8k{u`i;0b$?dFret(Bor#l2fI z)*k3hPsiKgV(=#TG!aExQhq`o@(LAzmW53O0M3AUCTYNoKr?@a;FF z6qs->rANGUcK=IG2y*{!SqVaat>=n8C^8|4Zk3`Ql6h&mu=jNgmw(MXuQRb)i zu-(jY;zbsH#;?Y|x*gE}vi=AM2#Xv_5c77F!{8w(P+Mlb7)5HLom#tKzB-=Ap3>N#Y>l_&ImUA99yLCg8C9Nc2%ub zHN>+Jwo7%WAKP2l1kctG&)5o|E))rtuPw~WYo+(0a-#C;wh1FPE2;ZY2UPQ8J;ey5 zw0c42QtkWSE2UD9!yf~YZy8Wx%$8lT1tgph{zy}9m+Gh7_v_vkCdODrWM*e)fBI+1 z2Gb{=FI%ieN+udsHT~AxA~D4}$h{zKv$c$*Uxx|6>w|u{V z+cDA6O7Yd6JGdhH9mc);kdIiQab28;IWSFq&h2Oq;Un|nYvRo%KJbnJ&ML~oC{Snq z%fhMC=9}``bsu%&+QL2f0c(R-Yh5pgSG6mh7C)Jr8B{pY_uKl%!lduIC2!Y``+`=0 z*|J|hcmEN-uSWL(I%VCgDbOR}qAu20;chHgQS z?X4F>CLnc4mwLBTHbU^$h|{`Fefc}OcynF0bu|GeJQ`G=7!CeEDUv#s_Adn;Y~Vo2 zvEU_9X!lQK&XXpuH$MuxHGg1(A*Y@>-ZYfXNbUV8&LjU?vV>+U~ zE%|Nrna#u=L5 zd{{%e{wup0qo8I+tl^Ij^L{v$OxJDtin0}`3yYkK0vaNbvO=*{@Q;ds%N>S4`Wfpr z^q;>N3Jx2#OV)Z|&n6STVa$0G5I=OW4PV}bn5$8qY)81NwqVMVCe#vsL8LxwGw8^1 z+}mBJsRGq7Ud>pF6kx8nxl1Z3>qAh_s{9uE*ZaIts?-5a{VyQ4Mdu)OyZ&!r;v9z! z+O3-%+tiJJ74M(l8jnPUPn71MCR;xVa4)54rhC`oF%A5`!ApiG3&d*9kw1ZKR8}N# zVmD%>Veu9Zo6wfi=<*5_gQo1iBhG(F*iRbPe9ELxP%X8R_K%V{#X#8u>CK5bdpk8? z+?juA66FR{D{=8c+a_dH8b$d(Bh)B)py>Y@At^@TKO-c?DEw!Hq?oq*Tg literal 0 HcmV?d00001 diff --git a/doc/markdown/dockerhub.md b/doc/markdown/dockerhub.md new file mode 100644 index 00000000..91b6cb22 --- /dev/null +++ b/doc/markdown/dockerhub.md @@ -0,0 +1,93 @@ +## CK docker hub + +[Docker hub](https://hub.docker.com/r/rocm/composable_kernel) + +## Why do I need this? + +To make our lives easier and bring Composable Kernel dependencies together, we recommend using docker images. + +## So what is Composable Kernel? + +Composable Kernel (CK) library aims to provide a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs, CPUs, etc, through general purpose kernel languages, like HIP C++. + +To get the CK library + +``` +git clone https://github.com/ROCmSoftwarePlatform/composable_kernel.git +``` + +run a docker container + +``` +docker run \ +-it \ +--privileged \ +--group-add sudo \ +-w /root/workspace \ +-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \ +rocm/composable_kernel:ck_ub20.04_rocm5.3_release \ +/bin/bash +``` + +and build the CK + +``` +mkdir build && cd build + +# Need to specify target ID, example below is for gfx908 and gfx90a +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3" \ +-D CMAKE_BUILD_TYPE=Release \ +-D GPU_TARGETS="gfx908;gfx90a" \ +.. +``` + +and + +``` +make -j examples tests +``` + +To run all the test cases including tests and examples run + +``` +make test +``` + +We can also run specific examples or tests like + +``` +./bin/example_gemm_xdl_fp16 +./bin/test_gemm_fp16 +``` + +For more details visit [CK github repo](https://github.com/ROCmSoftwarePlatform/composable_kernel), [CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/example), [even more CK examples](https://github.com/ROCmSoftwarePlatform/composable_kernel/tree/develop/client_example). + +## And what is inside? + +The docker images have everything you need for running CK including: + +* [ROCm](https://www.amd.com/en/graphics/servers-solutions-rocm) +* [CMake](https://cmake.org/) +* [Compiler](https://github.com/RadeonOpenCompute/llvm-project) + +## Which image is right for me? + +Let's take a look at the image naming, for example "ck_ub20.04_rocm5.4_release". 
The image specs are: + +* "ck" - made for running Composable Kernel +* "ub20.04" - based on Ubuntu 20.04 +* "rocm5.4" - ROCm platform version 5.4 +* "release" - compiler version is release + +So just pick the right image for your project dependencies and you're all set. + +## DIY starts here + +If you need to customize a docker image or just can't stop tinkering, feel free to adjust the [Dockerfile](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/Dockerfile) for your needs. + +## License + +CK is released under the MIT [license](https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/LICENSE). diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt new file mode 100644 index 00000000..c403e51e --- /dev/null +++ b/example/01_gemm/CMakeLists.txt @@ -0,0 +1,37 @@ +add_custom_target(example_gemm_dl) + +add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp) +add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp) +add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp) + +add_dependencies(example_gemm_dl example_gemm_dl_fp32) +add_dependencies(example_gemm_dl example_gemm_dl_fp16) +add_dependencies(example_gemm_dl example_gemm_dl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp) + add_dependencies(example_gemm_dl example_gemm_dl_int4) +endif(USE_BITINT_EXTENSION_INT4) + + +add_custom_target(example_gemm_xdl) + +add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp) +add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) +add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp) + +add_dependencies(example_gemm_xdl example_gemm_xdl_fp16) +add_dependencies(example_gemm_xdl example_gemm_xdl_bf16) +add_dependencies(example_gemm_xdl example_gemm_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_gemm_xdl_int4 gemm_xdl_int4.cpp) + add_dependencies(example_gemm_xdl example_gemm_xdl_int4) +endif(USE_BITINT_EXTENSION_INT4) + +add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp) +# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed +add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp) + +add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16) +add_dependencies(example_gemm_xdl example_gemm_xdl_fp64) diff --git a/example/01_gemm/README.md b/example/01_gemm/README.md new file mode 100644 index 00000000..226783b0 --- /dev/null +++ b/example/01_gemm/README.md @@ -0,0 +1,23 @@ +# Instructions for ```example_gemm_xdl``` + +## Run ```example_gemm_xdl``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: run kernel # of times (>1) +./bin/example_gemm_xdl 0 1 5 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +``` +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +arg.a_grid_desc_k0_m_k1_{512, 3840, 8} +arg.b_grid_desc_k0_n_k1_{512, 4096, 8} +arg.c_grid_desc_m_n_{ 3840, 4096} +launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 5 times... 
+Perf: 1.19685 ms, 107.657 TFlops, 78.8501 GB/s +``` diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp new file mode 100644 index 00000000..495a8159 --- /dev/null +++ b/example/01_gemm/common.hpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +struct ProblemSize final +{ + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +inline bool +parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig& config) +{ + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + problem_size.M = std::stoi(argv[4]); + problem_size.N = std::stoi(argv[5]); + problem_size.K = std::stoi(argv[6]); + + problem_size.StrideA = std::stoi(argv[7]); + problem_size.StrideB = std::stoi(argv[8]); + problem_size.StrideC = std::stoi(argv[9]); + } + else + { + std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" + << std::endl + << "arg3: time kernel (0=no, 1=yes)" << std::endl + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl; + return false; + } + + return true; +} diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp new file mode 100644 index 00000000..cf585a8c --- /dev/null +++ b/example/01_gemm/gemm_dl_fp16.cpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
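+//
+// gemm_dl_fp16: GEMM example built on ck::tensor_operation::device::DeviceGemmDl with
+// fp16 A/B/C tensors, fp32 accumulation, column-major A and row-major B/C. The shared
+// run_gemm_example() driver (run_gemm_example.inc) parses the command line (see
+// common.hpp), optionally verifies against the host ReferenceGemm, and times the kernel.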
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; + +using ALayout = Col; +using BLayout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp new file mode 100644 index 00000000..93f085cd --- /dev/null +++ b/example/01_gemm/gemm_dl_fp32.cpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
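+//
+// gemm_dl_fp32: same DeviceGemmDl example as the fp16 variant, but with fp32 A/B/C
+// tensors and fp32 accumulation (column-major A, row-major B and C).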
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" + +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; + +using ALayout = Col; +using BLayout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_dl_int4.cpp b/example/01_gemm/gemm_dl_int4.cpp new file mode 100644 index 00000000..e392c490 --- /dev/null +++ b/example/01_gemm/gemm_dl_int4.cpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
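+//
+// gemm_dl_int4: experimental int4 DeviceGemmDl example. Host tensors are declared as
+// ck::int4_t while the kernel instance uses int8_t with int32 accumulation;
+// BUILD_INT4_EXAMPLE is defined below, presumably so the shared driver performs the
+// int4 <-> int8 conversion. Compiling this file requires the
+// CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 build option (see the #error guard below).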
+ +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" + +using ADataType = ck::int4_t; +using BDataType = ck::int4_t; +using CDataType = ck::int4_t; +using KernelADataType = int8_t; +using KernelBDataType = int8_t; +using KernelCDataType = int8_t; +using AccDataType = int32_t; + +using ALayout = Col; +using BLayout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < KernelADataType, KernelBDataType, KernelCDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#define BUILD_INT4_EXAMPLE +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp new file mode 100644 index 00000000..be9e3877 --- /dev/null +++ b/example/01_gemm/gemm_dl_int8.cpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
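+//
+// gemm_dl_int8: DeviceGemmDl example with int8 A/B/C tensors and int32 accumulation
+// (column-major A, row-major B and C), driven by the shared run_gemm_example() routine.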
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" + +using ADataType = int8_t; +using BDataType = int8_t; +using CDataType = int8_t; +using AccDataType = int32_t; + +using ALayout = Col; +using BLayout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp new file mode 100644 index 00000000..9aaae6ad --- /dev/null +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
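+//
+// gemm_xdl_bf16: XDL GEMM example using DeviceGemm_Xdl_CShuffle with bf16 A/B/C
+// tensors, fp32 accumulation, a bf16 C-shuffle stage, and row-major A /
+// column-major B / row-major C layouts.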
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" + +using ADataType = ck::bhalf_t; +using BDataType = ck::bhalf_t; +using CDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = ck::bhalf_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle +// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp new file mode 100644 index 00000000..488babb7 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
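+//
+// gemm_xdl_fp16: fp16 XDL GEMM example with fp32 accumulation (row-major A,
+// column-major B, row-major C). Two device-op instances are defined below,
+// DeviceGemmXdl and DeviceGemm_Xdl_CShuffle; DeviceGemmInstance currently selects the
+// plain DeviceGemmXdl variant.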
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = float; +using CDataType = ck::half_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmXdl +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>; +// clang-format on + +// clang-format off +using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle +// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 
32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +using DeviceGemmInstance = DeviceGemmInstance0; + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp new file mode 100644 index 00000000..99253b74 --- /dev/null +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" + +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl +// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| +// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| +// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if 0 + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 64, 32, 32, 4, 1, 16, 16, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 7, 1>; +#else + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>; +#endif + // clang-format on + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_int4.cpp b/example/01_gemm/gemm_xdl_int4.cpp new file mode 100644 index 00000000..7f1283a4 --- /dev/null +++ b/example/01_gemm/gemm_xdl_int4.cpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
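+//
+// gemm_xdl_int4: experimental int4 XDL GEMM example. Host tensors are ck::int4_t while
+// the DeviceGemm_Xdl_CShuffle instance operates on int8_t with int32 accumulation and
+// an int8 C-shuffle stage; compiling it requires CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
+// (see the #error guard below), and BUILD_INT4_EXAMPLE is defined for the shared driver.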
+ +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" + +using ADataType = ck::int4_t; +using BDataType = ck::int4_t; +using CDataType = ck::int4_t; +using KernelADataType = int8_t; +using KernelBDataType = int8_t; +using KernelCDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle +// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, CLayout, KernelADataType, KernelBDataType, KernelCDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#define BUILD_INT4_EXAMPLE +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp new file mode 100644 index 00000000..e67594c5 --- /dev/null +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
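+// Int8 XDL GEMM example: int8_t A/B/C with int32_t accumulation on a
+// DeviceGemm_Xdl_CShuffle instance, driven by the shared run_gemm_example.inc.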
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" + +using ADataType = int8_t; +using BDataType = int8_t; +using CDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle +// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +#include "run_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp new file mode 100644 index 00000000..12a69925 --- /dev/null +++ b/example/01_gemm/gemm_xdl_skip_b_lds_fp16.cpp @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
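+// GEMM example using DeviceGemmXdlSkipBLds. Judging by the instance name and its
+// parameter table (a single BThreadTransfer SrcScalarPerVector plus a B buffer
+// size instead of the usual BBlockTransfer cluster/LDS settings), B is presumably
+// fed to the XDL pipeline without being staged through LDS; the authoritative
+// definition is device_gemm_xdl_skip_b_lds.hpp. Unlike the other 01_gemm examples,
+// this file carries its own main() rather than including run_gemm_example.inc.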
+ +#include "common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp" + +#include "ck/library/utility/literals.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +#define USING_SKIP_LDS 1 + +// clang-format off +#if USING_SKIP_LDS +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSkipBLds + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BThreadTransfer| BBlock| CThreadTransfer| CThreadTransfer| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| SrcScalar| buffer| SrcDstVectorDim| DstScalar| + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerVector| size | | PerVector| + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#if 0 + < F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 8, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 8, 8, 7, 1>; +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; +#else + < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 4, 4, 7, 1>; +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; +#endif + +#else +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 16, 64, 4, 4, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 
true, 7, 1, 2>; +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; + +#endif + // clang-format on + + using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +template +std::ostream& show_2d_matrix(std::ostream& os, Tensor& matrix) +{ + os << "[" << std::endl; + for(size_t x = 0; x < matrix.mDesc.GetLengths()[0]; x++) + { + os << "["; + for(size_t y = 0; y < matrix.mDesc.GetLengths()[1]; y++) + { + os << std::setw(5) << static_cast(matrix(x, y)); + } + os << "]" << std::endl; + } + os << "]"; + return os; +} +int main(int argc, char* argv[]) +{ + bool do_verification = 0; + int init_method = 0; + bool time_kernel = false; + + // GEMM shape +#if 1 + ck::index_t M = 16; + ck::index_t N = 64 * 120; + ck::index_t K = 4096; + + ck::index_t StrideA = K; + ck::index_t StrideB = K; + ck::index_t StrideC = N; +#else + ck::index_t M = 16; + ck::index_t N = 16; + ck::index_t K = 32; + + ck::index_t StrideA = 8; + ck::index_t StrideB = 8; + ck::index_t StrideC = 16; +#endif + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + // a_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = 
BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + if(do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + +#if 0 + { + show_2d_matrix(std::cout << "a : ", a_m_k) << std::endl; + show_2d_matrix(std::cout << "b: ", b_k_n) << std::endl; + show_2d_matrix(std::cout << "c_device: ", c_m_n_device_result) << std::endl; + show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl; + } +#endif + ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + } + + return 0; +} diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc new file mode 100644 index 00000000..4e2cedb5 --- /dev/null +++ b/example/01_gemm/run_gemm_example.inc @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
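+// Shared driver for the 01_gemm examples. Each including .cpp is expected to
+// define the data-type, layout and element-op aliases plus DeviceGemmInstance
+// and ReferenceGemmInstance before this include; in return it gets
+// run_gemm(problem_size, config) and the run_gemm_example(argc, argv) entry
+// point. When BUILD_INT4_EXAMPLE is defined, the int4 host tensors are converted
+// to the int8-based Kernel*DataType representation before/after the device copies.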
+ +#pragma once + +bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) +{ +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + + using namespace ck::literals; + + auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + switch(config.init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + default: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + } + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + +#ifdef BUILD_INT4_EXAMPLE + DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) * + c_m_n_device_result.mDesc.GetElementSpaceSize()); + + const Tensor a_m_k_converted(a_m_k); + const Tensor b_k_n_converted(b_k_n); + + a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data()); +#else + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); +#endif + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( +#ifdef BUILD_INT4_EXAMPLE + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), +#else + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_buf.GetDeviceBuffer()), +#endif + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cerr << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return true; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " 
<< ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + if(config.do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + +#ifdef BUILD_INT4_EXAMPLE + Tensor c_m_n_device_result_converted(c_m_n_host_result.mDesc); + + c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data()); + + c_m_n_device_result = c_m_n_device_result_converted.CopyAsType(); + + return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result); +#else + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); +#endif + } + + return true; +} + +bool run_gemm_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + return !parse_cmd_args(argc, argv, problem_size, config) || run_gemm(problem_size, config); +} diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/example/02_gemm_bilinear/CMakeLists.txt new file mode 100644 index 00000000..10ec0f1a --- /dev/null +++ b/example/02_gemm_bilinear/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_gemm_bilinear_xdl_fp16 gemm_bilinear_xdl_fp16.cpp) diff --git a/example/02_gemm_bilinear/README.md b/example/02_gemm_bilinear/README.md new file mode 100644 index 00000000..9eb87e1e --- /dev/null +++ b/example/02_gemm_bilinear/README.md @@ -0,0 +1,28 @@ +# Instructions for ```example_gemm_bilinear_xdl_fp16``` + +## Run ```example_gemm_bilinear_xdl_fp16``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE +#arg11 to 12: alpha, beta +./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5 +``` +Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16) +``` +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +arg.a_grid_desc_k0_m_k1_{512, 3840, 8} +arg.b_grid_desc_k0_n_k1_{512, 4096, 8} +arg.c0_grid_desc_m_n_{ 3840, 4096} +arg.c_grid_desc_m_n_{ 3840, 4096} +launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 1 times... +Perf: 0.936965 ms, 137.517 TFlops, 102.959 GB/s +error: 0 +max_diff: 0, 558.5, 558.5 +``` diff --git a/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp new file mode 100644 index 00000000..917b6b1c --- /dev/null +++ b/example/02_gemm_bilinear/gemm_bilinear_xdl_fp16.cpp @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
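+// FP16 "bilinear" GEMM example: E = alpha * (A * B) + beta * D, realized as a
+// DeviceGemmMultipleD_Xdl_CShuffle instance whose CDE elementwise operation is
+// the AlphaBetaAdd functor defined below (e = alpha * c + beta * d, with c the
+// float accumulator and d the fp16 auxiliary tensor).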
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +struct AlphaBetaAdd +{ + AlphaBetaAdd(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const; + + template <> + __host__ __device__ constexpr void operator()( + ck::half_t& e, const float& c, const ck::half_t& d) const + { + e = ck::type_convert(alpha_ * c + beta_ * ck::type_convert(d)); + }; + + float alpha_; + float beta_; +}; + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using DDataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using DLayout = Row; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AlphaBetaAdd; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using DeviceOpInstance = + ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle, + ELayout, + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + EDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmSpec, + 1, + 256, + 256, + 128, + 32, + 8, + 8, + 32, + 32, + 4, + 2, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + 1, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + 1, + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD = 4096; + ck::index_t StrideE = 4096; + + float alpha = 1.0f; + float beta = 1.0f; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 6) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + alpha = std::stof(argv[4]); + beta = std::stof(argv[5]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideD = std::stoi(argv[9]); + StrideE = std::stoi(argv[10]); + + alpha = std::stof(argv[11]); + beta = std::stof(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + 
printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, " + "beta\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d_device_buf.ToDevice(d_m_n.mData.data()); + e_device_buf.ToDevice(e_m_n_device_result.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{alpha, beta}; + + // do GEMM + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + if(do_verification) + { + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n)); + } + } + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/03_gemm_bias_relu/CMakeLists.txt b/example/03_gemm_bias_relu/CMakeLists.txt new file mode 100644 index 00000000..35c54aba --- /dev/null +++ b/example/03_gemm_bias_relu/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_gemm_bias_relu_xdl_fp16 gemm_bias_relu_xdl_fp16.cpp) diff --git a/example/03_gemm_bias_relu/README.md b/example/03_gemm_bias_relu/README.md new file mode 100644 index 00000000..f28a9a07 --- /dev/null +++ b/example/03_gemm_bias_relu/README.md @@ -0,0 +1,10 @@ +# Instructions for ```example_gemm_bias_relu_xdl_fp16``` + +## Run ```example_gemm_bias_relu_xdl_fp16``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE +./bin/example_gemm_bias_relu_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 +``` diff --git a/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp new file mode 100644 index 00000000..aee51d05 --- /dev/null +++ b/example/03_gemm_bias_relu/gemm_bias_relu_xdl_fp16.cpp @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
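+// FP16 GEMM + bias + ReLU example: E = Relu(A * B + D), where D acts as a
+// length-N bias row broadcast over M by passing StrideD = 0 to the device op.
+// The AddRelu functor defined below serves as the CDE elementwise operation.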
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// C = A * B +// E = Relu(C + D); +struct AddRelu +{ + __host__ __device__ void + operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d) const + { + const ck::half_t x = c + d; + + e = x > 0 ? x : 0; + } +}; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using DLayout = Row; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddRelu; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using DeviceOpInstance = + ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle, + ELayout, + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + EDataType, + AElementOp, + BElementOp, + CDEElementOp, + GemmSpec, + 1, + 256, + 256, + 128, + 32, + 8, + 8, + 32, + 32, + 4, + 2, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + 1, + S<4, 64, 1>, + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + 1, + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideE = 4096; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"); + exit(0); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor 
b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d_m_n(f_host_tensor_descriptor(M, N, 0, ELayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d_device_buf.ToDevice(d_m_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // do GEMM + auto device_op = DeviceOpInstance{}; + + auto invoker = device_op.MakeInvoker(); + + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{0}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error("wrong! this device_op instance does not support this problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(EDataType) * M * N + sizeof(EDataType) * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(do_verification) + { + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + Tensor c_m_n(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n)); + } + } + + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 
0 : 1; + } + + return 0; +} diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt new file mode 100644 index 00000000..c75c5ba5 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt @@ -0,0 +1,17 @@ +add_custom_target(example_gemm_add_add_fastgelu_xdl) + +add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp) +add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp) +add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp) +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp) +endif(USE_BITINT_EXTENSION_INT4) +add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp) + +add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16) +add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16) +add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp32) +if(USE_BITINT_EXTENSION_INT4) + add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4) +endif(USE_BITINT_EXTENSION_INT4) +add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8) diff --git a/example/04_gemm_add_add_fastgelu/README.md b/example/04_gemm_add_add_fastgelu/README.md new file mode 100644 index 00000000..08a55fb9 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/README.md @@ -0,0 +1,23 @@ +# Instructions for ```example_gemm_add_add_fastgelu_xdl_fp16``` + +## Run ```example_gemm_add_add_fastgelu_xdl_fp16``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE" +./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +``` +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1} +d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.26914 ms, 101.525 TFlops, 100.804 GB/s, DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8> +``` diff --git a/example/04_gemm_add_add_fastgelu/common.hpp b/example/04_gemm_add_add_fastgelu/common.hpp new file mode 100644 index 00000000..3f9375e0 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/common.hpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
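+// Shared definitions for the 04_gemm_add_add_fastgelu examples: data-type
+// aliases, the AddAddFastGelu CDE element-op (presumably E = FastGelu(A * B +
+// D0 + D1), given how it is applied to (c, d0, d1) in the reference path), and
+// the ProblemSize / ExecutionConfig structs with parse_cmd_args(). Note that
+// parse_cmd_args takes ExecutionConfig by value, so values parsed from argv are
+// written to a local copy and do not reach the caller's config object.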
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using F32 = float; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using I4 = ck::int4_t; +#endif +using I8 = int8_t; +using I32 = int32_t; + +struct ProblemSize final +{ + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = 4096; + ck::index_t StrideE = 4096; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +inline bool +parse_cmd_args(int argc, char* argv[], ProblemSize& problem_size, ExecutionConfig config) +{ + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else if(argc == 12) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + problem_size.M = std::stoi(argv[4]); + problem_size.N = std::stoi(argv[5]); + problem_size.K = std::stoi(argv[6]); + + problem_size.StrideA = std::stoi(argv[7]); + problem_size.StrideB = std::stoi(argv[8]); + problem_size.StrideD0 = std::stoi(argv[9]); + problem_size.StrideD1 = std::stoi(argv[10]); + problem_size.StrideE = std::stoi(argv[11]); + } + else + { + std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" + << std::endl + << "arg3: time kernel (0=no, 1=yes)" << std::endl + << "arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, " + "StrideE" + << std::endl; + return false; + } + + return true; +} diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp new file mode 100644 index 00000000..5e50c14d --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_bf16.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = BF16; +using D1DataType = BF16; +using DsDataType = ck::Tuple; +using EDataType = BF16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using DsLayout = ck::Tuple; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#include "run_gemm_add_add_fastgelu_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); } diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp new file mode 100644 index 00000000..6c7ca414 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp16.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = F16; +using D1DataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using DsLayout = ck::Tuple; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#include "run_gemm_add_add_fastgelu_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); } diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp new file mode 100644 index 00000000..1ef266f2 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_fp32.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = F32; +using D1DataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F32; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using DsLayout = ck::Tuple; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#include "run_gemm_add_add_fastgelu_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); } diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp new file mode 100644 index 00000000..8b5bc987 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int4.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
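+// Int4 variant of the add+add+FastGelu example. As in 01_gemm's int4 case, the
+// host tensors use ck::int4_t while the device op is instantiated with the
+// int8_t Kernel*DataType aliases; BUILD_INT4_EXAMPLE makes the shared .inc
+// driver perform the conversions. Requires CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4.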
+ +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include "common.hpp" + +using ADataType = I4; +using BDataType = I4; +using AccDataType = I32; +using CShuffleDataType = I32; +using D0DataType = I4; +using D1DataType = I4; +using DsDataType = ck::Tuple; +using EDataType = I4; + +using KernelADataType = I8; +using KernelBDataType = I8; +using KernelD0DataType = I8; +using KernelD1DataType = I8; +using KernelDsDataType = ck::Tuple; +using KernelEDataType = I8; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using DsLayout = ck::Tuple; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, KernelDsDataType, KernelEDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#define BUILD_INT4_EXAMPLE +#include "run_gemm_add_add_fastgelu_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); } diff --git a/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp new file mode 100644 index 00000000..b236f5e9 --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/gemm_add_add_fastgelu_xdl_int8.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +using ADataType = I8; +using BDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using D0DataType = I8; +using D1DataType = I8; +using DsDataType = ck::Tuple; +using EDataType = I8; + +using ALayout = Row; +using BLayout = Col; +using D0Layout = Row; +using D1Layout = Row; +using DsLayout = ck::Tuple; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 64, 1, 4>, 16>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +#include "run_gemm_add_add_fastgelu_example.inc" + +int main(int argc, char* argv[]) { return !run_gemm_add_add_fastgelu_example(argc, argv); } diff --git a/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc new file mode 100644 index 00000000..f3def33b --- /dev/null +++ b/example/04_gemm_add_add_fastgelu/run_gemm_add_add_fastgelu_example.inc @@ -0,0 +1,166 @@ +#pragma once + +bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionConfig& config) +{ +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + using namespace ck::literals; + + auto& [M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE] = problem_size; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if constexpr(std::is_same_v) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor 
a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{})); + Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor< +#ifdef BUILD_INT4_EXAMPLE + KernelEDataType +#else + EDataType +#endif + > + e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl; + std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d1_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d1_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor a_m_k_converted(a_m_k); + const Tensor b_k_n_converted(b_k_n); + const Tensor d0_m_n_converted(d0_m_n); + const Tensor d1_m_n_converted(d1_m_n); + + a_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_device_buf.ToDevice(b_k_n_converted.mData.data()); + d0_device_buf.ToDevice(d0_m_n_converted.mData.data()); + d1_device_buf.ToDevice(d1_m_n_converted.mData.data()); +#else + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d0_device_buf.ToDevice(d0_m_n.mData.data()); + d1_device_buf.ToDevice(d1_m_n.mData.data()); +#endif + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // do GEMM + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error("wrong! 
this device_op instance does not support this problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = 2_uz * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(D0DataType) * N + sizeof(D1DataType) * M * N + + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << device_op.GetTypeString() << std::endl; + + if(config.do_verification) + { + Tensor c_m_n({M, N}); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n)); + } + } + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor e_m_n_device_result_converted(e_m_n_device_result); + + return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result); +#else + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); +#endif + } + + return true; +} + +bool run_gemm_add_add_fastgelu_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + return !parse_cmd_args(argc, argv, problem_size, config) || + run_gemm_add_add_fastgelu(problem_size, config); +} diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt new file mode 100644 index 00000000..e0a53005 --- /dev/null +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -0,0 +1,11 @@ +add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp) +add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) +add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp) +add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) +# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed +add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp) + +add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp) +add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp) +add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp) + diff --git a/example/09_convnd_fwd/README.md b/example/09_convnd_fwd/README.md new file mode 100644 index 00000000..9ab5fee5 --- /dev/null +++ b/example/09_convnd_fwd/README.md @@ -0,0 +1,32 @@ +# Instructions for ```example_convnd_fwd_xdl``` + +## Run ```example_convnd_fwd_xdl``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: run kernel # of times (>1) +#arg4: N spatial dimensions (default 2) +#Following arguments (depending on number of spatial dims): +# N, K, C, +# , (ie Y, X for 2D) +# , (ie Hi, Wi for 2D) +# , (ie Sy, Sx for 2D) +# , (ie Dy, Dx for 2D) +# , (ie LeftPy, LeftPx for 2D) +# , (ie RightPy, RightPx for 2D) +./bin/example_convnd_fwd_xdl 0 1 100 +``` + +Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32) +``` +input: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +weights: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} +output: dim 4, lengths {128, 256, 36, 36}, 
strides {331776, 1, 9216, 256} +arg.a_grid_desc_k0_m_k1_{432, 165888, 4} +arg.b_grid_desc_k0_n_k1_{432, 256, 4} +arg.c_grid_desc_m_n_{ 165888, 256} +launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 100 times... +Perf: 4.43736 ms, 33.0753 TFlops, 150.357 GB/s +``` diff --git a/example/09_convnd_fwd/convnd_fwd_common.hpp b/example/09_convnd_fwd/convnd_fwd_common.hpp new file mode 100644 index 00000000..4c594ccd --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_common.hpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +void print_helper_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +template +bool run_grouped_conv_fwd(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + 
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{{}}, + std::array, 0>{{}}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err( + out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + } + + return true; +} diff --git a/example/09_convnd_fwd/convnd_fwd_dl_common.hpp b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp new file mode 100644 index 00000000..855710b9 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_dl_common.hpp @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
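+
+// Shared host-side driver for the DL (non-XDL) forward-convolution examples. It allocates
+// input/weight/bias/output host tensors, runs the DeviceConvNDFwdInstance supplied by the
+// including .cpp file with the bias passed as a single D tensor, and, when verification is
+// enabled, compares against the reference convolution followed by the same elementwise
+// output operator applied on the host.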
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +void print_helper_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +template +bool run_grouped_conv_fwd_dl(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + using DDataType = ck::remove_cvref_t>; + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(out_g_n_k_wos_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + wei.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + bias.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_1{1}); + wei.GenerateTensorValue(GeneratorTensor_1{-1}); + bias.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(DDataType) * bias.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array d_g_n_k_wos_lengths{}; + std::array d_g_n_k_wos_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), 
d_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), d_g_n_k_wos_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument( + in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{bias_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 1>{{d_g_n_k_wos_lengths}}, + std::array, 1>{{d_g_n_k_wos_strides}}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + std::cout << "wrong! device_conv with the specified compilation parameters does not " + "support this Conv problem" + << std::endl; + return true; + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd< + NDimSpatial, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + ck::tensor_operation::element_wise::PassThrough>(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = + ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + ck::tensor_operation::element_wise::PassThrough{}); + + ref_invoker.Run(ref_argument); + + // cde_elementwise + out_host.ForEach( + [&](auto&, auto idx) { out_element_op(out_host(idx), out_host(idx), bias(idx)); }); + + out_device_buf.FromDevice(out_device.mData.data()); + + return ck::utils::check_err( + out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); + } + + return true; +} diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp new file mode 100644 index 00000000..db5a7f0b --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_dl_fp16.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
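+
+// fp16 instantiation of the DL grouped forward convolution with a bias epilogue. AddRelu is
+// used as the output elementwise operator, which (as the name suggests) adds the bias D
+// tensor to the convolution result and applies ReLU; accumulation is done in fp32.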
+ +#include "convnd_fwd_dl_common.hpp" + +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using AccDataType = float; +using DsDataType = ck::Tuple; +using OutDataType = ck::half_t; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddRelu; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +// clang-format off +using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK +// ######| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +#include "run_convnd_fwd_dl_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp new file mode 100644 index 00000000..964d784c --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_dl_fp32.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_dl_common.hpp" + +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = float; +using WeiDataType = float; +using AccDataType = float; +using DsDataType = ck::Tuple; +using OutDataType = float; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddRelu; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +// clang-format off +using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK +// ######| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +#include "run_convnd_fwd_dl_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp new file mode 100644 index 00000000..b0cd88f2 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_dl_int8.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_dl_common.hpp" + +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using AccDataType = int32_t; +using DsDataType = ck::Tuple; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddRelu; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +// clang-format off +using DeviceGroupedConvNDFwdInstance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK +// ######| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < NDimSpatial, InDataType, WeiDataType, DsDataType, OutDataType, AccDataType, InLayout, WeiLayout, ck::Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +#include "run_convnd_fwd_dl_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_dl_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp new file mode 100644 index 00000000..d55d3154 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_bf16.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = ck::bhalf_t; +using WeiDataType = ck::bhalf_t; +using AccDataType = float; +using CShuffleDataType = float; +using OutDataType = ck::bhalf_t; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp new file mode 100644 index 00000000..d84afba6 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp16.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using AccDataType = float; +using CShuffleDataType = ck::half_t; +using OutDataType = ck::half_t; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::UnaryConvert; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp new file mode 100644 index 00000000..f5acc540 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp32.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = float; +using WeiDataType = float; +using AccDataType = float; +using CShuffleDataType = float; +using OutDataType = float; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 16, // KPerBlock + 4, // AK1 + 4, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 4, // ABlockTransferSrcScalarPerVector + 4, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 4, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 16, 1, 16>, + 4>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp new file mode 100644 index 00000000..8d697976 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_fp64.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = double; +using WeiDataType = double; +using AccDataType = double; +using CShuffleDataType = double; +using OutDataType = double; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 8, // KPerBlock + 2, // AK1 + 2, // BK1 + 16, // MPerXdl + 16, // NPerXdl + 4, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 2, // ABlockTransferSrcScalarPerVector + 2, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 2, // BBlockTransferSrcScalarPerVector + 2, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 16, 1, 16>, + 1>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp new file mode 100644 index 00000000..99f7f256 --- /dev/null +++ b/example/09_convnd_fwd/convnd_fwd_xdl_int8.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "convnd_fwd_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" + +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 64, 1, 4>, + 16>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc new file mode 100644 index 00000000..697ada14 --- /dev/null +++ b/example/09_convnd_fwd/run_convnd_fwd_dl_example.inc @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +bool run_convnd_fwd_dl_example(int argc, char* argv[]) +{ + print_helper_msg(); + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::utils::conv::ConvParam conv_param{ + 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) { + constexpr ck::index_t ndim_spatial_value = ndim_spatial.value; + std::cout << "ndim_spatial_value: " << ndim_spatial_value << std::endl; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_grouped_conv_fwd_dl< + ndim_spatial_value, + InDataType, + WeiDataType, + DsDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceGroupedConvNDFwdInstance>( + do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + }; + + namespace ctc = ck::tensor_layout::convolution; + + if(conv_param.num_dim_spatial_ == 1) + { + return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{}); + } + else if(conv_param.num_dim_spatial_ == 2) + { + return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{}); + } + else if(conv_param.num_dim_spatial_ == 3) + { + return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{}); + } + + return true; +} diff --git a/example/09_convnd_fwd/run_convnd_fwd_example.inc b/example/09_convnd_fwd/run_convnd_fwd_example.inc new file mode 100644 index 00000000..36a68056 --- /dev/null +++ b/example/09_convnd_fwd/run_convnd_fwd_example.inc @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
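+
+// Same command-line handling as the DL driver above; after parsing, the problem is
+// dispatched on conv_param.num_dim_spatial_ to the packed GNWC/GKXC/GNWK (1D),
+// GNHWC/GKYXC/GNHWK (2D) or GNDHWC/GKZYXC/GNDHWK (3D) layouts before calling
+// run_grouped_conv_fwd with the instance chosen by the including .cpp file.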
+ +#pragma once + +bool run_convnd_fwd_example(int argc, char* argv[]) +{ + print_helper_msg(); + + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::utils::conv::ConvParam conv_param{ + 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) { + constexpr ck::index_t ndim_spatial_value = ndim_spatial.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_grouped_conv_fwd< + ndim_spatial_value, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceGroupedConvNDFwdInstance>( + do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + }; + + namespace ctc = ck::tensor_layout::convolution; + + if(conv_param.num_dim_spatial_ == 1) + { + return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{}); + } + else if(conv_param.num_dim_spatial_ == 2) + { + return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{}); + } + else if(conv_param.num_dim_spatial_ == 3) + { + return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{}); + } + + return true; +} diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt new file mode 100644 index 00000000..98941b4d --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt @@ -0,0 +1,16 @@ +add_custom_target(example_convnd_fwd_reduce_xdl) + +add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp) +add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp) +add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp) +add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp) + +add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8) +add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16) +add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16) +add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp) + add_dependencies(example_convnd_fwd_reduce_xdl 
example_convnd_fwd_max_xdl_int4) +endif(USE_BITINT_EXTENSION_INT4) diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp new file mode 100644 index 00000000..00e370f2 --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using BF16 = ck::bhalf_t; +using FP16 = ck::half_t; +using FP32 = float; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using I4 = ck::int4_t; +#endif +using I8 = std::int8_t; +using I32 = std::int32_t; + +template +struct LayoutSetting +{ + using ALayout = ALay; + using BLayout = BLay; + using DELayout = DELay; + using RLayout = RLay; +}; + +template +struct LayoutSettingSelector; + +namespace ctl = ck::tensor_layout::convolution; + +template <> +struct LayoutSettingSelector<1> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<2> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<3> final + : LayoutSetting +{ +}; + +template +using ALayout = typename LayoutSettingSelector::ALayout; + +template +using BLayout = typename LayoutSettingSelector::BLayout; + +template +using DELayout = typename LayoutSettingSelector::DELayout; + +template +using RLayout = typename LayoutSettingSelector::RLayout; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool parse_cmd_args(int argc, + char* argv[], + ck::utils::conv::ConvParam& problem_size, + ExecutionConfig& config) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam arguments + else 
if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + problem_size = ck::utils::conv::parse_conv_param( + num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + print_help_msg(); + return false; + } + + return true; +} + +inline HostTensorDescriptor +make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size) +{ + std::vector dimensions{problem_size.G_, problem_size.N_}; + + ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions)); + + return HostTensorDescriptor(dimensions); +} + +template +void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor, + Lengths& lengths, + Strides& strides) +{ + assert(size(descriptor.GetLengths()) == size(lengths)); + std::copy_n(begin(descriptor.GetLengths()), size(descriptor.GetLengths()), begin(lengths)); + + assert(size(descriptor.GetStrides()) == size(strides)); + std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides)); +} diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp new file mode 100644 index 00000000..6ff29b4b --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_bf16.cpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; +using ReduceAccDataType = FP32; +using R0DataType = FP32; +using RsDataType = ck::Tuple; + +#include "run_convnd_fwd_max_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); } diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp new file mode 100644 index 00000000..02c19c2b --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp16.cpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using ADataType = FP16; +using BDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using DsDataType = ck::Tuple<>; +using EDataType = FP16; +using ReduceAccDataType = FP32; +using R0DataType = FP32; +using RsDataType = ck::Tuple; + +#include "run_convnd_fwd_max_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); } diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp new file mode 100644 index 00000000..679bb5c0 --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_fp32.cpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
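+
+// fp32 variant of the fused convolution + reduction example: the forward-convolution output
+// is additionally reduced (a max, per the example's name) over the output-channel dimension,
+// producing the R0 tensor laid out as (G, N, spatial) in run_convnd_fwd_max_example.inc.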
+ +#include "common.hpp" + +using ADataType = FP32; +using BDataType = FP32; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using DsDataType = ck::Tuple<>; +using EDataType = FP32; +using ReduceAccDataType = FP32; +using R0DataType = FP32; +using RsDataType = ck::Tuple; + +#include "run_convnd_fwd_max_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); } diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp new file mode 100644 index 00000000..abdbdaf7 --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int4.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#define BUILD_INT4_EXAMPLE + +#include "common.hpp" + +using ADataType = I4; +using BDataType = I4; +using KernelADataType = I8; +using KernelBDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using DsDataType = ck::Tuple<>; +using EDataType = I32; +using ReduceAccDataType = I32; +using R0DataType = I32; +using RsDataType = ck::Tuple; + +#include "run_convnd_fwd_max_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); } diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp new file mode 100644 index 00000000..cf86afa8 --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/convnd_fwd_max_xdl_int8.cpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using ADataType = I8; +using BDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using DsDataType = ck::Tuple<>; +using EDataType = I32; +using ReduceAccDataType = I32; +using R0DataType = I32; +using RsDataType = ck::Tuple; + +#include "run_convnd_fwd_max_example.inc" + +int main(int argc, char* argv[]) { return !run_convnd_fwd_max_example(argc, argv); } diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc new file mode 100644 index 00000000..b3a38917 --- /dev/null +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using RsThreadReduceOp = ck::Tuple; + +using RsGlobalReduceOp = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +template +using DeviceInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle +//######| NDimSpatial| ALayout| BLayout| DELayout| RLayout| AData| BData| AccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| Conv| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer| +//######| | | | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Fwd|Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector| +//######| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| Specialization| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| | +#ifdef BUILD_INT4_EXAMPLE + < NDimSpatial, ALayout, BLayout, DELayout, RLayout, KernelADataType, KernelBDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>; +#else + < NDimSpatial, ALayout, BLayout, DELayout, RLayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, ConvSpec, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>; +#endif + +template +using HostInstance = ck::tensor_operation::host::ReferenceConvFwd + ; +// clang-format on + +template +bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, + const ExecutionConfig& config) +{ + static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial"); + +#if 
defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + + const auto conv_input_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed>( + problem_size); + + const auto conv_weight_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed>( + problem_size); + + const auto conv_output_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed>( + problem_size); + + const auto r0_desc = make_r0_host_tensor_descriptor(problem_size); + + Tensor conv_input(conv_input_g_n_c_wis_desc); + Tensor conv_weight(conv_weight_g_k_c_xs_desc); + Tensor conv_output_device(conv_output_g_n_k_wos_desc); + Tensor r0_device(r0_desc); + + switch(config.init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_input); + ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_weight); + break; + default: + ck::utils::FillUniformDistribution{-5, 5}(conv_input); + ck::utils::FillUniformDistribution{-5, 5}(conv_weight); + } + + DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize()); + DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize()); + DeviceMem conv_output_device_buf(sizeof(EDataType) * + conv_output_device.mDesc.GetElementSpaceSize()); + DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor conv_input_converted(conv_input); + const Tensor conv_weight_converted(conv_weight); + + conv_input_device_buf.ToDevice(conv_input_converted.mData.data()); + conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data()); +#else + conv_input_device_buf.ToDevice(conv_input.mData.data()); + conv_weight_device_buf.ToDevice(conv_weight.mData.data()); +#endif + + std::array conv_input_g_n_c_wis_lengths{}, + conv_input_g_n_c_wis_strides{}; + std::array conv_weight_g_k_c_xs_lengths{}, + conv_weight_g_k_c_xs_strides{}; + std::array conv_output_g_n_k_wos_lengths{}, + conv_output_g_n_k_wos_strides{}; + std::array r0_lengths{}, r0_strides{}; + std::array conv_filter_strides{}, conv_filter_dilations{}; + std::array input_left_pads{}, input_right_pads{}; + + unpack_host_tensor_descriptor( + conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides); + unpack_host_tensor_descriptor( + conv_weight_g_k_c_xs_desc, conv_weight_g_k_c_xs_lengths, conv_weight_g_k_c_xs_strides); + unpack_host_tensor_descriptor( + conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides); + unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides); + + ck::ranges::copy(problem_size.conv_filter_strides_, begin(conv_filter_strides)); + ck::ranges::copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations)); + ck::ranges::copy(problem_size.input_left_pads_, begin(input_left_pads)); + ck::ranges::copy(problem_size.input_right_pads_, begin(input_right_pads)); + + // run Conv + Reduction on device + auto conv = DeviceInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(), + conv_weight_device_buf.GetDeviceBuffer(), + std::array{}, + conv_output_device_buf.GetDeviceBuffer(), + {r0_device_buf.GetDeviceBuffer()}, + conv_input_g_n_c_wis_lengths, + conv_input_g_n_c_wis_strides, + conv_weight_g_k_c_xs_lengths, + conv_weight_g_k_c_xs_strides, + std::array, 
0>{{}}, + std::array, 0>{{}}, + conv_output_g_n_k_wos_lengths, + conv_output_g_n_k_wos_strides, + r0_lengths, + r0_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + AElementOp{}, + BElementOp{}, + CDEElementOp{}, + QsElementOp{}, + RsElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + std::cerr << "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem" + << std::endl; + return false; + } + + const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + const std::size_t flop = problem_size.GetFlops(); + const std::size_t num_btype = problem_size.GetByte(); + + const float tflops = static_cast(flop) / 1.E9 / avg_time; + const float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(config.do_verification) + { + Tensor conv_output_host(conv_output_g_n_k_wos_desc); + + // run Conv + Reduction on host + auto ref_conv = HostInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(conv_input, + conv_weight, + conv_output_host, + problem_size.conv_filter_strides_, + problem_size.conv_filter_dilations_, + problem_size.input_left_pads_, + problem_size.input_right_pads_, + AElementOp{}, + BElementOp{}, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + Tensor r0_host(r0_device.mDesc); + + auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}]; + + auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths(); + + if constexpr(NDimSpatial == 1) + { + for(std::size_t g = 0; g < output_dims[0]; ++g) + { + for(std::size_t n = 0; n < output_dims[1]; ++n) + { + for(std::size_t w = 0; w < output_dims[3]; ++w) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + for(std::size_t k = 0; k < output_dims[2]; ++k) + { + + auto e_val = + ck::type_convert(conv_output_host(g, n, k, w)); + reduce0_op(reduce0_acc, e_val); + } + r0_host(g, n, w) = ck::type_convert(reduce0_acc); + } + } + } + } + else if constexpr(NDimSpatial == 2) + { + for(std::size_t g = 0; g < output_dims[0]; ++g) + { + for(std::size_t n = 0; n < output_dims[1]; ++n) + { + for(std::size_t h = 0; h < output_dims[3]; ++h) + { + for(std::size_t w = 0; w < output_dims[4]; ++w) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + for(std::size_t k = 0; k < output_dims[2]; ++k) + { + + auto e_val = ck::type_convert( + conv_output_host(g, n, k, h, w)); + reduce0_op(reduce0_acc, e_val); + } + r0_host(g, n, h, w) = ck::type_convert(reduce0_acc); + } + } + } + } + } + else if constexpr(NDimSpatial == 3) + { + for(std::size_t g = 0; g < output_dims[0]; ++g) + { + for(std::size_t n = 0; n < output_dims[1]; ++n) + { + for(std::size_t d = 0; d < output_dims[3]; ++d) + { + for(std::size_t h = 0; h < output_dims[4]; ++h) + { + for(std::size_t w = 0; w < output_dims[5]; ++w) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + for(std::size_t k = 0; k < output_dims[2]; ++k) + { + + auto e_val = ck::type_convert( + conv_output_host(g, n, k, d, h, w)); + reduce0_op(reduce0_acc, e_val); + } + r0_host(g, n, d, h, w) = ck::type_convert(reduce0_acc); + } + } + } + } + } + } + + conv_output_device_buf.FromDevice(conv_output_device.mData.data()); + r0_device_buf.FromDevice(r0_device.mData.data()); + + return ck::utils::check_err(conv_output_device, + conv_output_host, + "Error: incorrect results! 
(Matrix E)", + 1e-5f, + 1e-4f) && + ck::utils::check_err( + r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f); + } + + return true; +} + +bool run_convnd_fwd_max_example(int argc, char* argv[]) +{ + ck::utils::conv::ConvParam problem_size{ + 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + ExecutionConfig config; + + if(!parse_cmd_args(argc, argv, problem_size, config)) + { + return false; + } + + switch(problem_size.num_dim_spatial_) + { + case 1: return run_convnd_fwd_max<1>(problem_size, config); + case 2: return run_convnd_fwd_max<2>(problem_size, config); + case 3: return run_convnd_fwd_max<3>(problem_size, config); + } + + return false; +} diff --git a/example/12_reduce/CMakeLists.txt b/example/12_reduce/CMakeLists.txt new file mode 100644 index 00000000..6e58ed93 --- /dev/null +++ b/example/12_reduce/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_reduce_blockwise reduce_blockwise.cpp) +add_example_executable(example_reduce_multiblock_atomic_add reduce_multiblock_atomic_add.cpp) +add_example_executable(example_reduce_blockwise_two_call reduce_blockwise_two_call.cpp) diff --git a/example/12_reduce/README.md b/example/12_reduce/README.md new file mode 100644 index 00000000..76d28527 --- /dev/null +++ b/example/12_reduce/README.md @@ -0,0 +1,62 @@ +# Instructions for ```example_reduce_blockwise``` + +## Run ```example_reduce_blockwise``` +```bash +# -D : input 3d/4d/5d tensor lengths +# -R : reduce dimension ids +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1 +``` + +Result +``` +./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 0 2 1 +launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 0.238063 ms, 264.285 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> +``` + +## Run ```example_reduce_multiblock_atomic_add``` +```bash +# -D : input 3d/4d/5d tensor lengths +# -R : reduce dimension ids +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp32, 1: fp64) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0 +``` + +Result +``` +./bin/example_reduce_multiblock_atomic_add -D 16,64,32,960 -v 1 0 2 0 +Perf: 0 ms, inf GB/s, DeviceReduceMultiBlock<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1> +echo $? +0 +``` + +# Instructions for ```example_reduce_blockwise_two_call``` + +## Run ```example_reduce_blockwise_two_call``` +```bash +#arg1: verification (0=no, 1=yes( +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_reduce_blockwise_two_call 1 2 1 +``` + +Result +``` +./bin/example_reduce_blockwise_two_call 1 2 1 +launch_and_time_kernel: grid_dim {204800, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {6400, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... 
+Perf: 2.1791 ms, 771.42 GB/s, DeviceReduceBlockWise<256,M_C32_S1,K_C8_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> => DeviceReduceBlockWise<256,M_C256_S1,K_C1_S1,InSrcVectorDim_1_InSrcVectorSize_1_OutDstVectorSize_1> +``` diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp new file mode 100644 index 00000000..a7ee9990 --- /dev/null +++ b/example/12_reduce/reduce_blockwise.cpp @@ -0,0 +1,299 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/utility/reduction_enums.hpp" +#include "reduce_blockwise_impl.hpp" +#include "reduce_example_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {16, 64, 32, 960}; + std::vector reduceDims = {0, 1, 2}; + std::vector scales = {1.0f, 0.0f}; + + bool do_verification = true; + int data_type = 1; + int init_method = 2; + bool time_kernel = true; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64, 7: int4)" + << std::endl; + std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 3 > argc) + { + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + }; + + data_type = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(scales.empty()) + { + scales.push_back(1.0f); + scales.push_back(0.0f); + }; + + return (0); + }; +}; + +template +bool reduce_blockwise_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + bool matched = false; + int result = 0; + + const auto tuple_object = 
reduce_shape_instances{}; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + if(matched) + return; + + using ShapeType = remove_cvref_t(tuple_object))>; + + if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) + return; + + std::array arrReduceDims; + + ck::ranges::copy(reduceDims, arrReduceDims.begin()); + + result = reduce_blockwise_impl( + do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta); + + matched = true; + }); + + return (result == 0) ? true : false; +}; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 1) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 3) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 5) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 6) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + else if(arg.data_type == 7) + { + pass = reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + + pass = pass && reduce_blockwise_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } +#endif + } + else + { + // for testing half_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing float + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing double + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing bhalf_t + pass = pass && + reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing int8_t + pass = + pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + // for testing int4_t using AVG operation + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing int4_t using MAX operation + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); +#endif + // for testing 3D input + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 960}, {0, 1}, 1.0f, 0.0f); + + // for testing 5D input + pass = pass && reduce_blockwise_test( + true, 2, true, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f); + }; + + return (pass ? 
0 : 1); +}; diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp new file mode 100644 index 00000000..7bafd2d2 --- /dev/null +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_reduction.hpp" + +#include "reduce_example_common.hpp" + +template +int reduce_blockwise_impl(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::array& reduceDims, + float alpha, + float beta) + +{ + using namespace ck; + using namespace ck::tensor_operation::device; + + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; + + constexpr bool op_support_indices = + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + + constexpr bool invalid_reduce_1 = OutputIndex && !op_support_indices; + + // 1) If InOutDataType is half_t, must use half_t as AccDataType for indexable reduction + // operations 2) If InOutDataType is half_t, must use float as AccDataType for non-indexable + // reduction operations + constexpr bool invalid_reduce_2 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InOutDataType is float, must use float as AccDataType for indexable reduction + // operations + constexpr bool invalid_reduce_3 = + std::is_same::value && + (op_support_indices && !std::is_same::value); + + // 1) If InOutDataType is int8_t or int4_t, must use int8_t as AccDataType for indexable + // reduction operations 2) If InOutDataType is int8_t or int4_t, must use int32_t as AccDataType + // for non-indexable reduction operations + constexpr bool invalid_reduce_4 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + constexpr bool invalid_reduce_4_2 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); +#endif + + // 1) If InOutDataType is int8_t or int4_t, the supported operation must be either indexable + // operations or ADD/AVG + constexpr bool invalid_reduce_5 = std::is_same::value && + (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD && + ReduceOpId != ReduceTensorOp::AVG); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + constexpr bool invalid_reduce_5_2 = std::is_same::value && + (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD && + ReduceOpId != ReduceTensorOp::AVG); +#endif + + // 1) If InOutDataType is bhalf_t, must use float as AccDataType for all reduction operations + constexpr bool invalid_reduce_6 = + std::is_same::value && !std::is_same::value; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + constexpr bool invalid_reduce = + 
(invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 || + invalid_reduce_5 || invalid_reduce_6 || invalid_reduce_4_2 || invalid_reduce_5_2); +#else + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || + invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); +#endif + + if constexpr(invalid_reduce) + { + std::cerr << "The reduction setting is invalid, exiting!" << std::endl; + return (-1); + }; + + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + using InOutDataTypeInDevice = typename std:: + conditional::value, int8_t, InOutDataType>::type; +#else + using InOutDataTypeInDevice = InOutDataType; +#endif + + using DeviceReduceInstance = + ck::tensor_operation::device::DeviceReduceMultiBlock; // OutDstVectorSize + + Tensor in(inLengths); + + std::vector outLengths; + + auto invariantDims = get_invariant_dims(reduceDims); + + if(invariantDims.empty()) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize()); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if(std::is_same::value) + { + std::vector tmp_buf(in.mData.size()); + + std::copy_n(in.mData.data(), in.mData.size(), tmp_buf.data()); + in_dev.ToDevice(tmp_buf.data()); + } + else +#endif + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + { +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if(std::is_same::value) + { + std::vector tmp_buf(in.mData.size()); + + std::copy_n(out.mData.data(), out.mData.size(), tmp_buf.data()); + out_dev.ToDevice(tmp_buf.data()); + } + else +#endif + out_dev.ToDevice(out.mData.data()); + }; + + size_t indicesSizeInBytes = OutputIndex ? 
out.mDesc.GetElementSize() * sizeof(int32_t) : 0; + + DeviceMem out_index_dev(indicesSizeInBytes); + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); + + if(do_verification) + { + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); + }; + + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; + + ck::ranges::copy(inLengths, arrInLengths.begin()); + ck::ranges::copy(inStrides, arrInStrides.begin()); + ck::ranges::copy(outLengths, arrOutLengths.begin()); + ck::ranges::copy(outStrides, arrOutStrides.begin()); + + auto reduce = DeviceReduceInstance{}; + + auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + out_index_dev.GetDeviceBuffer(), + in_elementwise_op, + acc_elementwise_op); + + if(!reduce.IsSupportedArgument(argument_ptr.get())) + { + std::cerr + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + + return (-2); + }; + + std::string reduce_name = reduce.GetTypeString(); + + auto invoker_ptr = reduce.MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + bool pass = true; + + if(do_verification) + { +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if(std::is_same::value) + { + std::vector tmp_buf(out.mData.size()); + + out_dev.FromDevice(tmp_buf.data()); + + std::copy_n(tmp_buf.data(), out.mData.size(), out.mData.data()); + } + else +#endif + out_dev.FromDevice(out.mData.data()); + + pass = pass && ck::utils::check_err(out, out_ref); + + if(OutputIndex) + { + out_index_dev.FromDevice(out_indices.mData.data()); + pass = pass && ck::utils::check_err(out_indices, out_indices_ref); + }; + }; + + return (pass ? 0 : 1); +} diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp new file mode 100644 index 00000000..39821f24 --- /dev/null +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
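+
+// Two-call NORM2 reduction: instead of reducing dimensions {3, 4} of the 5-D input
+// {64, 320, 80, 4, 128} in a single kernel, this example chains two blockwise
+// reductions. The first call applies the NORM2 input elementwise op (square) and
+// reduces the innermost dimension, producing the intermediate tensor {64, 320, 80, 4};
+// the second call reduces dimension 3 of that intermediate and applies the final
+// accumulator elementwise op (square root), yielding the {64, 320, 80} output.
+// Chaining the two calls is therefore equivalent to one NORM2 reduction over {3, 4},
+// which is exactly what the host reference computes for verification.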
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_reduction.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InOutDataType = ck::half_t; +using InOutDataType = ck::half_t; +using AccDataType = float; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2; +constexpr bool PropagateNan = true; +constexpr bool OutputIndex = false; + +using ReduceOperation = typename reduce_binary_operator::opType; +using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; +using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + +using PassThroughOp = tensor_operation::element_wise::PassThrough; + +using DeviceReduceInstance_1 = DeviceReduceMultiBlock; + +using DeviceReduceInstance_2 = DeviceReduceMultiBlock; + +static bool do_verify; +static int init_method; +static float alpha; +static float beta; +static bool time_kernel; + +int main(int argc, char* argv[]) +{ + // used by the device reduction + const std::array reduceDims_1 = {4}; + // const std::array invariantDims_1 = {0, 1, 2, 3}; + + const std::array reduceDims_2 = {3}; + // const std::array invariantDims_2 = {0, 1, 2}; + + // used by the host reduction + const std::array reduceDims = {3, 4}; + const std::array invariantDims = {0, 1, 2}; + + const std::vector inLengths_1 = {64, 320, 80, 4, 128}; + + // input lengths of the second reduction, which is also the output lengths of the first + // reduction + const std::vector inLengths_2 = {64, 320, 80, 4}; + + const std::vector outLengths = {64, 320, 80}; + + if(argc == 1) + { + do_verify = true; + init_method = 2; + time_kernel = true; + } + else if(argc == 4) + { + do_verify = static_cast(argv[1]); + init_method = atoi(argv[2]); + time_kernel = static_cast(atoi(argv[3])); + } + else + { + std::ostringstream ostr; + + ostr << "Wrong parameter! 
" << std::endl + << "Usage: " << argv[0] << "[verify 0/1] init_method time_kernel" << std::endl; + + throw std::runtime_error(ostr.str()); + }; + + alpha = 1.0f; + beta = 0.0f; + + Tensor in_1(inLengths_1); + + Tensor out_ref(outLengths); + Tensor in_2(inLengths_2); // also the output tensor of the first reduction + Tensor out(outLengths); + + auto inStrides_1 = in_1.mDesc.GetStrides(); + auto inStrides_2 = in_2.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verify) + { + switch(init_method) + { + case 0: break; + case 1: + in_1.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in_1.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in_1.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize()); + DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize()); + + in_1_dev.ToDevice(in_1.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); + + if(do_verify) + { + ReductionHost + hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + in_1.mData.data(), + beta, + out_ref.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + }; + + std::array arrInLengths_1; + std::array arrInStrides_1; + std::array arrInLengths_2; + std::array arrInStrides_2; + std::array arrOutLengths; + std::array arrOutStrides; + + ck::ranges::copy(inLengths_1, arrInLengths_1.begin()); + ck::ranges::copy(inStrides_1, arrInStrides_1.begin()); + ck::ranges::copy(inLengths_2, arrInLengths_2.begin()); + ck::ranges::copy(inStrides_2, arrInStrides_2.begin()); + ck::ranges::copy(outLengths, arrOutLengths.begin()); + ck::ranges::copy(outStrides, arrOutStrides.begin()); + + auto reduce_1 = DeviceReduceInstance_1{}; + + auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1, + arrInStrides_1, + arrInLengths_2, + arrInStrides_2, + reduceDims_1, + 1.0f, + 0.0f, + in_1_dev.GetDeviceBuffer(), + nullptr, + in_2_dev.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + PassThroughOp{}); + + if(!reduce_1.IsSupportedArgument(argument_ptr_1.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" 
+ << std::endl; + }; + + auto invoker_ptr_1 = reduce_1.MakeInvokerPointer(); + + auto reduce_2 = DeviceReduceInstance_2{}; + + auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2, + arrInStrides_2, + arrOutLengths, + arrOutStrides, + reduceDims_2, + alpha, + beta, + in_2_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + nullptr, + PassThroughOp{}, + acc_elementwise_op); + + if(!reduce_2.IsSupportedArgument(argument_ptr_2.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + }; + + auto invoker_ptr_2 = reduce_2.MakeInvokerPointer(); + + float avg_time_1 = invoker_ptr_1->Run(argument_ptr_1.get(), StreamConfig{nullptr, time_kernel}); + float avg_time_2 = invoker_ptr_2->Run(argument_ptr_2.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / (avg_time_1 + avg_time_2); + + std::cout << "Perf: " << avg_time_1 + avg_time_2 << " ms, " << gb_per_sec << " GB/s, " + << reduce_1.GetTypeString() << " => " << reduce_2.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verify) + { + out_dev.FromDevice(out.mData.data()); + pass = pass && ck::utils::check_err(out, out_ref); + }; + + return (pass ? 0 : 1); +} diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp new file mode 100644 index 00000000..05f0a0ed --- /dev/null +++ b/example/12_reduce/reduce_example_common.hpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) +{ + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::array invariantDims; + + // collect invariant dimensions + int dim = 0; + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims[dim] = i; + dim++; + }; + + return invariantDims; +}; + +template +struct ReduceShape +{ + static constexpr ck::index_t Rank_ = Rank; + static constexpr ck::index_t NumReduceDim_ = NumReduceDim; +}; + +using reduce_shape_instances = std::tuple, + ReduceShape<3, 2>, + ReduceShape<4, 1>, + ReduceShape<4, 2>, + ReduceShape<4, 3>, + ReduceShape<5, 1>, + ReduceShape<5, 2>, + ReduceShape<5, 3>, + ReduceShape<5, 4>>; diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp new file mode 100644 index 00000000..c4d63a3a --- /dev/null +++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
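+
+// This example exercises the multiblock reduction path in which several workgroups
+// accumulate into the same output element via atomic adds. Only reductions whose
+// result can be built up additively are eligible, so
+// reduce_multiblock_atomic_add_impl.hpp rejects anything other than ADD/AVG, and only
+// the float and double instantiations listed in the usage text (data type 0/1) are
+// provided by this driver.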
+ +#include +#include +#include +#include + +#include "ck/utility/reduction_enums.hpp" +#include "reduce_multiblock_atomic_add_impl.hpp" +#include "reduce_example_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {16, 64, 32, 960}; + std::vector reduceDims = {0, 1, 2}; + std::vector scales = {1.0f, 0.0f}; + + bool do_verification = true; + int data_type = 1; + int init_method = 2; + bool time_kernel = true; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1: data type (0: fp32, 1: fp64)" << std::endl; + std::cout << "Arg2 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg3 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:R:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 3 > argc) + { + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + }; + + data_type = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(scales.empty()) + { + scales.push_back(1.0f); + scales.push_back(0.0f); + }; + + return (0); + }; +}; + +template +bool reduce_multiblock_atomic_add_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + float alpha, + float beta) +{ + bool matched = false; + int result = 0; + + const auto tuple_object = reduce_shape_instances{}; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + if(matched) + return; + + using ShapeType = remove_cvref_t(tuple_object))>; + + if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) + return; + + std::array a_reduceDims; + + ck::ranges::copy(reduceDims, a_reduceDims.begin()); + + result = reduce_multiblock_atomic_add_impl( + do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta); + + matched = true; + }); + + return (result == 0) ? 
true : false; +}; + +constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::AVG; +constexpr bool PropagateNan = true; + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + pass = reduce_multiblock_atomic_add_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + else if(arg.data_type == 1) + { + pass = reduce_multiblock_atomic_add_test( + arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inLengths, + arg.reduceDims, + arg.scales[0], + arg.scales[1]); + } + } + else + { + // for testing float + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing double + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 32, 960}, {0, 1, 2}, 1.0f, 0.0f); + + // for testing 3D input + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 960}, {0, 1}, 1.0f, 0.0f); + + // for testing 5D input + pass = pass && reduce_multiblock_atomic_add_test( + true, 2, false, {16, 64, 32, 2, 960}, {0, 1, 2, 3}, 1.0f, 0.0f); + }; + + return (pass ? 0 : 1); +}; diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp new file mode 100644 index 00000000..94867aee --- /dev/null +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_reduction.hpp" + +#include "reduce_example_common.hpp" + +template +int reduce_multiblock_atomic_add_impl(bool do_verification, + int init_method, + bool time_kernel, + const std::vector& inLengths, + const std::array& reduceDims, + float alpha, + float beta) + +{ + using namespace ck; + using namespace ck::tensor_operation::device; + + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; + + constexpr bool op_support_atomic_add = + (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG); + + constexpr bool invalid_reduce_1 = !op_support_atomic_add; + constexpr bool invalid_reduce_2 = + !(std::is_same::value || std::is_same::value); + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2); + + if(invalid_reduce) + { + std::cerr << "The reduction setting is invalid, exiting!" 
<< std::endl; + return (-1); + }; + + using ReduceOperation = typename reduce_binary_operator::opType; + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + + using DeviceReduceInstance = + ck::tensor_operation::device::DeviceReduceMultiBlock; + + Tensor in(inLengths); + + std::vector outLengths; + + auto invariantDims = get_invariant_dims(reduceDims); + + if(invariantDims.empty()) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); + + if(do_verification) + { + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + }; + + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; + + ck::ranges::copy(inLengths, arrInLengths.begin()); + ck::ranges::copy(inStrides, arrInStrides.begin()); + ck::ranges::copy(outLengths, arrOutLengths.begin()); + ck::ranges::copy(outStrides, arrOutStrides.begin()); + + auto reduce = DeviceReduceInstance{}; + + auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + + if(!reduce.IsSupportedArgument(argument_ptr.get())) + { + std::cerr + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" 
+ << std::endl; + + return (-2); + }; + + std::string reduce_name = reduce.GetTypeString(); + + auto invoker_ptr = reduce.MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InOutDataType) + + invariant_total_length * sizeof(InOutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + bool pass = true; + + if(do_verification) + { + out_dev.FromDevice(out.mData.data()); + pass = pass && ck::utils::check_err(out, out_ref); + }; + + return (pass ? 0 : 1); +} diff --git a/example/13_pool2d_fwd/CMakeLists.txt b/example/13_pool2d_fwd/CMakeLists.txt new file mode 100644 index 00000000..db09c033 --- /dev/null +++ b/example/13_pool2d_fwd/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp) +add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp) + diff --git a/example/13_pool2d_fwd/README.md b/example/13_pool2d_fwd/README.md new file mode 100644 index 00000000..9b017734 --- /dev/null +++ b/example/13_pool2d_fwd/README.md @@ -0,0 +1,41 @@ +# Instructions for ```example_pool2d_fwd``` Examples + +## Run ```example_pool2d_fwd_fp16``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx +./bin/example_pool2d_fwd_fp16 1 1 1 +``` + +Result +``` +in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192} +launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 0.397436 ms, 1.44252 TFlops, 783.713 GB/s +``` + +## Run ```example_pool2d_fwd_fp32``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx +./bin/example_pool2d_fwd_fp32 1 1 1 +``` + + +Result +``` +./bin/example_pool2d_fwd_fp32 1 1 1 +in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +out_n_c_ho_wo: dim 4, lengths {128, 192, 36, 36}, strides {248832, 1, 6912, 192} +launch_and_time_kernel: grid_dim {124416, 1, 1}, block_dim {64, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.01823 ms, 0.563045 TFlops, 611.8 GB/s +``` diff --git a/example/13_pool2d_fwd/pool2d_fwd_common.hpp b/example/13_pool2d_fwd/pool2d_fwd_common.hpp new file mode 100644 index 00000000..b83cb6a9 --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_common.hpp @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
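+
+// Pool2D forward is implemented here as a windowed reduction: every (n, c, ho, wo)
+// output element reduces the Y x X input window selected by the window strides and
+// paddings, with ReduceOpId choosing MAX or AVG pooling. The output spatial sizes
+// follow the convolution-style formula used in pool_test below,
+//     Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1,
+// e.g. for the default shape in the fp16/fp32 drivers (Hi = 71, Y = 3, stride 2,
+// pads 1/1) this gives Ho = (71 + 1 + 1 - 3) / 2 + 1 = 36, matching the
+// out_n_c_ho_wo lengths {128, 192, 36, 36} reported in the README.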
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" + +template +static void pool_host_verify(const Tensor& in, + Tensor& out, + Tensor& out_indices, + const std::array& window_spatial_lengths, + const std::array& window_strides, + const std::array& in_left_pads, + const std::array& /*in_right_pads*/) +{ + const int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1]; + + using ReduceOperation = typename ck::reduce_binary_operator::opType; + + auto elementwise_ops = + ck::reduce_unary_operator::GetElementwiseOperator(reduceLength); + + auto in_elementwise_op = std::get<0>(elementwise_ops); + auto acc_elementwise_op = std::get<1>(elementwise_ops); + + if constexpr(!OutputIndex) + { + using Accumulation = + ck::detail::AccumulateWithNanCheck; + + auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { + auto accuVal = ReduceOperation::template GetIdentityValue(); + + for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) + { + ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0]; + for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) + { + ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; + if(hi >= 0 && hi < static_cast(in.mDesc.GetLengths()[2]) && + wi >= 0 && wi < static_cast(in.mDesc.GetLengths()[3])) + { + AccDataType currVal = static_cast(in(n, c, hi, wi)); + + in_elementwise_op(currVal, currVal); + + Accumulation::Calculate(accuVal, currVal); + } + } + } + + acc_elementwise_op(accuVal, accuVal); + + out(n, c, ho, wo) = accuVal; + }; + + make_ParallelTensorFunctor(f_nchw, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); + } + else + { + using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck; + auto f_nchw = [&](auto n, auto c, auto ho, auto wo) { + auto accuVal = ReduceOperation::template GetIdentityValue(); + IndexDataType accuIndex = 0; + + for(ck::index_t y = 0; y < window_spatial_lengths[0]; ++y) + { + ck::index_t hi = ho * window_strides[0] + y - in_left_pads[0]; + for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) + { + ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && + wi < in.mDesc.GetLengths()[3]) + { + AccDataType currVal = static_cast(in(n, c, hi, wi)); + IndexDataType currIndex = y * window_spatial_lengths[1] + x; + + in_elementwise_op(currVal, currVal); + + Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); + } + } + } + + acc_elementwise_op(accuVal, accuVal); + + out(n, c, ho, wo) = accuVal; + out_indices(n, c, ho, wo) = accuIndex; + }; + + make_ParallelTensorFunctor(f_nchw, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); + }; +} + +template +bool pool_test(bool do_verification, + int init_method, + bool time_kernel, + ck::index_t N, + ck::index_t C, + 
ck::index_t Y, + ck::index_t X, + ck::index_t Hi, + ck::index_t Wi, + ck::index_t window_stride_h, + ck::index_t window_stride_w, + ck::index_t in_left_pad_h, + ck::index_t in_left_pad_w, + ck::index_t in_right_pad_h, + ck::index_t in_right_pad_w) +{ + using DevicePoolFwdInstance = + ck::tensor_operation::device::DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C< + InDataType, // InDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + ReduceOpId, + OutputIndex, + 64, // BlockSize + 64, // ReduceMThreadClusterSize + 1, // ReduceKThreadClusterSize + 4, // ReduceMThreadSliceSize + 1, // ReduceKThreadSliceSize + 4>; // InSrcOutDstVectorSize + + const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Y) / window_stride_h + 1; + const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - X) / window_stride_w + 1; + + const std::array window_spatial_lengths{{Y, X}}; + const std::array window_strides{{window_stride_h, window_stride_w}}; + const std::array input_left_pads{{in_left_pad_h, in_left_pad_w}}; + const std::array input_right_pads{{in_right_pad_h, in_right_pad_w}}; + + // tensor layout + auto f_host_tensor_descriptor = + [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { + using namespace ck::literals; + + if constexpr(ck::is_same::value) + { + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}); + } + else if constexpr(ck::is_same::value) + { + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); + } + }; + + Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); + Tensor out_n_c_ho_wo_host(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_indices_n_c_ho_wo_host( + f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_n_c_ho_wo_device(f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + Tensor out_indices_n_c_ho_wo_device( + f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); + + std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; + std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1{1}); break; + case 2: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; + default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * + out_n_c_ho_wo_device.mDesc.GetElementSpaceSize()); + DeviceMem out_indices_device_buf(sizeof(IndexDataType) * + out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + + auto pool = DevicePoolFwdInstance{}; + auto invoker_ptr = pool.MakeInvokerPointer(); + auto argument_ptr = pool.MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + static_cast(out_indices_device_buf.GetDeviceBuffer()), + N, + C, + std::array{{Hi, Wi}}, + std::array{{Y, X}}, + std::array{{Ho, Wo}}, + window_strides, + input_left_pads, + input_right_pads); + + if(!pool.IsSupportedArgument(argument_ptr.get())) + { + throw std::runtime_error("wrong! 
device_op with the specified compilation parameters does " + "not support this problem"); + } + + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * N * C * Ho * Wo * Y * X; + + std::size_t num_btype = + sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(OutDataType) * (N * C * Ho * Wo); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + bool pass = true; + + if(do_verification) + { + pool_host_verify(in_n_c_hi_wi, + out_n_c_ho_wo_host, + out_indices_n_c_ho_wo_host, + window_spatial_lengths, + window_strides, + input_left_pads, + input_right_pads); + + out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); + + pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host); + + if constexpr(OutputIndex) + { + out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); + + pass = pass && + ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host); + }; + } + + return (pass); +}; diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp new file mode 100644 index 00000000..659f3251 --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_fp16.cpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "pool2d_fwd_common.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +using IndexDataType = int32_t; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; +#endif + +static constexpr bool OutputIndex = false; +static constexpr bool PropagateNan = false; + +int main(int argc, char* argv[]) +{ + bool do_verification; + int init_method; + bool time_kernel; + + // Pool shape + ck::index_t N = 128; + ck::index_t C = 192; + ck::index_t Y = 3; + ck::index_t X = 3; + ck::index_t Hi = 71; + ck::index_t Wi = 71; + ck::index_t window_stride_h = 2; + ck::index_t window_stride_w = 2; + ck::index_t in_left_pad_h = 1; + ck::index_t in_left_pad_w = 1; + ck::index_t in_right_pad_h = 1; + ck::index_t in_right_pad_w = 1; + + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + } + else if(argc == 16) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + + N = std::stoi(argv[4]); + C = std::stoi(argv[5]); + Y = std::stoi(argv[6]); + X = std::stoi(argv[7]); + Hi = std::stoi(argv[8]); + Wi = std::stoi(argv[9]); + window_stride_h = std::stoi(argv[10]); + window_stride_w = std::stoi(argv[11]); + in_left_pad_h = std::stoi(argv[12]); + in_left_pad_w = std::stoi(argv[13]); + in_right_pad_h = std::stoi(argv[14]); + in_right_pad_w = std::stoi(argv[15]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: 
initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(0); + } + + bool pass = pool_test(do_verification, + init_method, + time_kernel, + N, + C, + Y, + X, + Hi, + Wi, + window_stride_h, + window_stride_w, + in_left_pad_h, + in_left_pad_w, + in_right_pad_h, + in_right_pad_w); + + return (pass ? 0 : 1); +} diff --git a/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp new file mode 100644 index 00000000..f47c7ff1 --- /dev/null +++ b/example/13_pool2d_fwd/pool2d_fwd_fp32.cpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "pool2d_fwd_common.hpp" + +using InDataType = float; +using OutDataType = float; +using AccDataType = float; + +using IndexDataType = int32_t; + +using InLayout = ck::tensor_layout::convolution::NHWC; +using OutLayout = ck::tensor_layout::convolution::NHWC; + +#if 1 +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; +#else +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; +#endif + +static constexpr bool OutputIndex = false; +static constexpr bool PropagateNan = false; + +int main(int argc, char* argv[]) +{ + bool do_verification; + int init_method; + bool time_kernel; + + // Pool shape + ck::index_t N = 128; + ck::index_t C = 192; + ck::index_t Y = 3; + ck::index_t X = 3; + ck::index_t Hi = 71; + ck::index_t Wi = 71; + ck::index_t window_stride_h = 2; + ck::index_t window_stride_w = 2; + ck::index_t in_left_pad_h = 1; + ck::index_t in_left_pad_w = 1; + ck::index_t in_right_pad_h = 1; + ck::index_t in_right_pad_w = 1; + + if(argc == 1) + { + do_verification = true; + init_method = 1; + time_kernel = true; + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + } + else if(argc == 16) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = static_cast(std::stoi(argv[3])); + + N = std::stoi(argv[4]); + C = std::stoi(argv[5]); + Y = std::stoi(argv[6]); + X = std::stoi(argv[7]); + Hi = std::stoi(argv[8]); + Wi = std::stoi(argv[9]); + window_stride_h = std::stoi(argv[10]); + window_stride_w = std::stoi(argv[11]); + in_left_pad_h = std::stoi(argv[12]); + in_left_pad_w = std::stoi(argv[13]); + in_right_pad_h = std::stoi(argv[14]); + in_right_pad_w = std::stoi(argv[15]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(0); + } + + bool pass = pool_test(do_verification, + init_method, + time_kernel, + N, + C, + Y, + X, + Hi, + Wi, + window_stride_h, + window_stride_w, + in_left_pad_h, + in_left_pad_w, + in_right_pad_h, + in_right_pad_w); + + return (pass ? 
0 : 1); +} diff --git a/example/14_gemm_quantization/CMakeLists.txt b/example/14_gemm_quantization/CMakeLists.txt new file mode 100644 index 00000000..ca09c48c --- /dev/null +++ b/example/14_gemm_quantization/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_gemm_xdl_bias_relu_quantization_int8 gemm_xdl_bias_relu_quantization_int8.cpp) +add_example_executable(example_gemm_xdl_quantization_int8 gemm_xdl_quantization_int8.cpp) \ No newline at end of file diff --git a/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp new file mode 100644 index 00000000..d5f4e6f6 --- /dev/null +++ b/example/14_gemm_quantization/gemm_xdl_bias_relu_quantization_int8.cpp @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +template +using S = ck::Sequence; + +using I8 = int8_t; +using I32 = int32_t; +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using CDEElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; + +using ADataType = I8; +using BDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using BiasDataType = I32; +using DsDataType = ck::Tuple; +using EDataType = I8; + +using ALayout = Row; +using BLayout = Col; +using BiasLayout = Row; +using DsLayout = ck::Tuple; +using ELayout = Row; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle< + ALayout, + BLayout, + DsLayout, + ELayout, + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + PassThrough, // AElementwiseOperation, + PassThrough, // BElementwiseOperation, + CDEElementOp, // CDEElementwiseOperation, + GemmDefault, // GemmSpecialization GemmSpec, + 1, // NumGemmKPrefetchStage, + 256, // BlockSize, + 256, // MPerBlock, + 128, // NPerBlock, + 64, // KPerBlock, + 16, // AK1, + 16, // BK1, + 32, // MPerXDL, + 32, // NPerXDL, + 4, // MXdlPerWave, + 2, // NXdlPerWave, + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1, + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + S<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, // index_t ABlockTransferSrcVectorDim, + 16, // index_t ABlockTransferSrcScalarPerVector, + 16, // index_t ABlockTransferDstScalarPerVector_AK1, + 1, // bool ABlockLdsExtraM, + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1, + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder, + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder, + 
2, // index_t BBlockTransferSrcVectorDim, + 8, // index_t BBlockTransferSrcScalarPerVector, + 8, // index_t BBlockTransferDstScalarPerVector_BK1, + 1, // bool BBlockLdsExtraN, + 1, // index_t CShuffleMXdlPerWavePerShuffle, + 1, // index_t CShuffleNXdlPerWavePerShuffle, + S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + 8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock> +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideBias = 0; + ck::index_t StrideE = 1024; + + float requant_scale = 0.03; + + auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1_uz})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1_uz, stride})); + } + }; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor(std::vector({len}), + std::vector({stride})); + }; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor bias_n(f_host_tensor_descriptor1d(N, 1)); + Tensor e_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "bias_n: " << bias_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + a_m_k.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + bias_n.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + bias_device_buf.ToDevice(bias_n.mData.data()); + + auto a_element_op = PassThrough{}; + auto b_element_op = PassThrough{}; + auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {StrideBias}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + if(do_verification) + { + Tensor c_m_n(HostTensorDescriptor{M, N}); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), bias_n(n)); + } + } + + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp new file mode 100644 index 00000000..23717373 --- /dev/null +++ b/example/14_gemm_quantization/gemm_xdl_quantization_int8.cpp @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +template +using S = ck::Sequence; + +using I8 = int8_t; +using I32 = int32_t; +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ActivationOp = PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; + +using ADataType = I8; +using BDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I32; +using DsDataType = ck::Tuple<>; +using EDataType = I8; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmMultipleD_Xdl_CShuffle< + ALayout, + BLayout, + DsLayout, + ELayout, + ADataType, + BDataType, + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + PassThrough, // AElementwiseOperation, + PassThrough, // BElementwiseOperation, + CDEElementOp, // CDEElementwiseOperation, + GemmDefault, // GemmSpecialization GemmSpec, + 1, // NumGemmKPrefetchStage, + 256, // BlockSize, + 256, // MPerBlock, + 128, // NPerBlock, + 64, // KPerBlock, + 16, // AK1, + 16, // 
BK1, + 32, // MPerXDL, + 32, // NPerXDL, + 4, // MXdlPerWave, + 2, // NXdlPerWave, + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1, + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + S<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, // index_t ABlockTransferSrcVectorDim, + 16, // index_t ABlockTransferSrcScalarPerVector, + 16, // index_t ABlockTransferDstScalarPerVector_AK1, + 1, // bool ABlockLdsExtraM, + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1, + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder, + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder, + 2, // index_t BBlockTransferSrcVectorDim, + 8, // index_t BBlockTransferSrcScalarPerVector, + 8, // index_t BBlockTransferDstScalarPerVector_BK1, + 1, // bool BBlockLdsExtraN, + 1, // index_t CShuffleMXdlPerWavePerShuffle, + 1, // index_t CShuffleNXdlPerWavePerShuffle, + S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + 16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock> +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host:: + ReferenceGemm; + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideE = 1024; + + float requant_scale = 0.03; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1_uz})); + } + else + { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({1_uz, stride})); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl; + + a_m_k.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + + auto a_element_op = PassThrough{}; + auto b_element_op = PassThrough{}; + auto cde_element_op = CDEElementOp{requant_scale, ActivationOp{}}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + if(do_verification) + { + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, e_m_n_host_result, a_element_op, b_element_op, cde_element_op); + + ref_invoker.Run(ref_argument); + + return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/15_grouped_gemm/CMakeLists.txt b/example/15_grouped_gemm/CMakeLists.txt new file mode 100644 index 00000000..67f61608 --- /dev/null +++ b/example/15_grouped_gemm/CMakeLists.txt @@ -0,0 +1,17 @@ +add_custom_target(example_grouped_gemm_xdl) + +add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp) +add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp) +add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp) +add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp) + +add_dependencies(example_grouped_gemm_xdl + example_grouped_gemm_xdl_fp32 + example_grouped_gemm_xdl_fp16 + example_grouped_gemm_xdl_bfp16 + example_grouped_gemm_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4) +endif() diff --git a/example/15_grouped_gemm/README.md b/example/15_grouped_gemm/README.md new file mode 100644 index 00000000..c83b23e0 --- /dev/null +++ b/example/15_grouped_gemm/README.md @@ -0,0 +1,25 @@ +# Instructions for ```example_grouped_gemm_xdl``` + +## Run ```example_grouped_gemm_xdl``` +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: run kernel # of times (>1) +./bin/example_grouped_gemm_xdl_fp16 0 1 5 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +``` +gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1} +gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1} +gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1} +gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1} +group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128} +group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256} +group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384} +group: 3 
arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512} +launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1} +Warm up +Start running 5 times... +Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> +``` diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp new file mode 100644 index 00000000..05d572a1 --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = F32; +using CShuffleDataType = BF16; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, 
EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp new file mode 100644 index 00000000..3f78dafa --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp16.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DsDataType = ck::Tuple<>; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, 
AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp new file mode 100644 index 00000000..fd93bb5f --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_fp32.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, 
DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>; +// clang-format on + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp new file mode 100644 index 00000000..faf41bbf --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int4.cpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = ck::int4_t; +using BDataType = ck::int4_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using DsDataType = ck::Tuple<>; +using EDataType = ck::int4_t; + +using KernelADataType = int8_t; +using KernelBDataType = int8_t; +using KernelEDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off + < ALayout, //ALayout + BLayout, //BLayout + DsLayout, //DsLayout + ELayout, //ELayout + KernelADataType, //ADataType + KernelBDataType, //BDataType + AccDataType, //AccDataType + CShuffleDataType, //CShuffleDataType + DsDataType, //DsDataType + KernelEDataType, //EDataType + AElementOp, //AElementwiseOperation + BElementOp, //BElementwiseOperation + CDEElementOp, //CDEElementwiseOperation + GemmDefault, //GEMMSpecialization + 1, // NumGemmKPrefetchStage + 256, // BlockSize + 256, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 4, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 16, // ABlockTransfer SrcScalarPerVector + 16, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 
16, // BBlockTransfer SrcScalarPerVector + 16, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 64, 1, 4>, // CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl + 16>; // CBlockTransferScalarPerVector_NWaveNPerXdl +// clang-format on + +#define BUILD_INT4_EXAMPLE +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp new file mode 100644 index 00000000..7cb09778 --- /dev/null +++ b/example/15_grouped_gemm/grouped_gemm_xdl_int8.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = int8_t; +using BDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; +using DsDataType = ck::Tuple<>; +using EDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemm_Xdl + // clang-format off +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>; +// clang-format on + +#include "run_grouped_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_gemm_example(argc, argv); } diff --git a/example/15_grouped_gemm/run_grouped_gemm_example.inc b/example/15_grouped_gemm/run_grouped_gemm_example.inc new file mode 100644 index 00000000..324e1772 --- /dev/null +++ b/example/15_grouped_gemm/run_grouped_gemm_example.inc @@ -0,0 +1,265 @@ +#pragma once + +struct ProblemSize final +{ + std::vector Ms; + std::vector Ns; + std::vector Ks; + + std::vector stride_As; + std::vector stride_Bs; + std::vector stride_Cs; + + ck::index_t group_count; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) +{ +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); + static_assert(sizeof(ADataType) == sizeof(KernelADataType)); + static_assert(sizeof(BDataType) == sizeof(KernelBDataType)); + static_assert(sizeof(EDataType) == sizeof(KernelEDataType)); +#endif + int group_count = problem_size.group_count; + + // GEMM shape + std::vector gemm_descs; + std::vector p_a, p_b; + std::vector p_c; + + gemm_descs.reserve(group_count); + + for(int i = 0; i < group_count; i++) + { + int M = problem_size.Ms[i]; + int N = problem_size.Ns[i]; + int K = problem_size.Ks[i]; + + int stride_A = problem_size.stride_As[i]; + int stride_B = problem_size.stride_Bs[i]; + int stride_C = problem_size.stride_Cs[i]; + + gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}}); + } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + std::vector> a_tensors; + std::vector> b_tensors; + std::vector> c_host_tensors; +#ifdef BUILD_INT4_EXAMPLE + std::vector> c_device_tensors; +#else + std::vector> c_device_tensors; +#endif + + a_tensors.reserve(group_count); + b_tensors.reserve(group_count); + c_host_tensors.reserve(group_count); + c_device_tensors.reserve(group_count); + + using DeviceMemPtr = std::unique_ptr; + + std::vector a_tensors_device, b_tensors_device, c_tensors_device; + + a_tensors_device.reserve(group_count); + b_tensors_device.reserve(group_count); + c_tensors_device.reserve(group_count); + + std::size_t flop = 0, num_btype = 0; + + for(std::size_t i = 0; i < gemm_descs.size(); i++) + { + a_tensors.push_back(Tensor(f_host_tensor_descriptor( + gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{}))); + b_tensors.push_back(Tensor(f_host_tensor_descriptor( + gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{}))); + c_host_tensors.push_back(Tensor(f_host_tensor_descriptor( + gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}))); +#ifdef BUILD_INT4_EXAMPLE + c_device_tensors.push_back(Tensor(f_host_tensor_descriptor( + 
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}))); +#else + c_device_tensors.push_back(Tensor(f_host_tensor_descriptor( + gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}))); +#endif + std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc + << " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << c_device_tensors[i].mDesc + << std::endl; + + flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_; + num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() + + sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() + + sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize(); + + switch(config.init_method) + { + case 0: break; + case 1: + a_tensors[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_tensors[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + } + } + + for(std::size_t i = 0; i < gemm_descs.size(); i++) + { + a_tensors_device.emplace_back(std::make_unique( + sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize())); + b_tensors_device.emplace_back(std::make_unique( + sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize())); + c_tensors_device.emplace_back(std::make_unique( + sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize())); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor a_converted(a_tensors[i]); + const Tensor b_converted(b_tensors[i]); + + a_tensors_device[i]->ToDevice(a_converted.mData.data()); + b_tensors_device[i]->ToDevice(b_converted.mData.data()); +#else + a_tensors_device[i]->ToDevice(a_tensors[i].mData.data()); + b_tensors_device[i]->ToDevice(b_tensors[i].mData.data()); +#endif + + p_a.push_back(a_tensors_device[i]->GetDeviceBuffer()); + p_b.push_back(b_tensors_device[i]->GetDeviceBuffer()); + p_c.push_back(c_tensors_device[i]->GetDeviceBuffer()); + } + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CDEElementOp{}; + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + std::vector> p_Ds = {}; + + // do GEMM + auto argument = gemm.MakeArgument( + p_a, p_b, p_Ds, p_c, gemm_descs, a_element_op, b_element_op, c_element_op); + + DeviceMem gemm_desc_workspace(gemm.GetWorkSpaceSize(&argument)); + + gemm.SetWorkSpacePointer(&argument, gemm_desc_workspace.GetDeviceBuffer()); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + invoker.Run(argument, StreamConfig{nullptr, false}); + + bool pass = true; + if(config.do_verification) + { + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + for(std::size_t i = 0; i < gemm_descs.size(); i++) + { + c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data()); + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_tensors[i], + b_tensors[i], + c_host_tensors[i], + a_element_op, + b_element_op, + c_element_op); + + ref_invoker.Run(ref_argument); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor c_device_result_converted(c_device_tensors[i]); + pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]); + +#else + pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]); +#endif + } + } + + if(config.time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + + return pass; +} + +bool run_grouped_gemm_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + problem_size.group_count = 16; + + for(int i = 0; i < problem_size.group_count; i++) + { + problem_size.Ms.push_back(256 + 256 * i); + problem_size.Ns.push_back(128 + 128 * i); + problem_size.Ks.push_back(128 + 64 * i); + + problem_size.stride_As.push_back(problem_size.Ks[i]); + problem_size.stride_Bs.push_back(problem_size.Ks[i]); + problem_size.stride_Cs.push_back(problem_size.Ns[i]); + } + + if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + exit(0); + } + + return run_grouped_gemm(problem_size, config); +} diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt new file mode 100644 index 00000000..226656a7 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt @@ -0,0 +1,40 @@ +add_custom_target(example_gemm_reduce_xdl) +add_custom_target(example_gemm_reduce_xdl_max) +add_custom_target(example_gemm_reduce_xdl_mean_meansquare) +add_custom_target(example_gemm_add_add_mean_meansquare_xdl) + +add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) +add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp) +add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp) +add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp) + +add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp) + +add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp) +add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp) +add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp) +add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp) + 
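For reference, the TFlops and GB/s numbers reported by the grouped-GEMM example come from summing per-group work: each group contributes 2*M*N*K FLOPs plus the bytes of its A, B, and E tensors, and the totals are divided by the measured kernel time (in ms). A minimal standalone sketch of that accounting follows; `GroupShape` and `report_perf` are illustrative names rather than CK APIs, and it reuses the same default 16-group problem sizes as `run_grouped_gemm_example`.

```cpp
// Standalone sketch of the perf accounting used by run_grouped_gemm(): FLOPs
// and bytes are summed over all groups, then divided by the measured time.
// GroupShape and report_perf are illustrative names, not CK APIs.
#include <cstdio>
#include <cstddef>
#include <vector>

struct GroupShape
{
    std::size_t M, N, K;
};

void report_perf(const std::vector<GroupShape>& groups, float ave_time_ms, std::size_t elem_bytes)
{
    std::size_t flop = 0, num_bytes = 0;

    for(const auto& g : groups)
    {
        flop += std::size_t(2) * g.M * g.N * g.K;                      // 2*M*N*K per GEMM
        num_bytes += elem_bytes * (g.M * g.K + g.K * g.N + g.M * g.N); // A + B + E traffic
    }

    // Same unit conventions as the example: time is in ms, so flop/1e9/ms gives
    // TFLOP/s and bytes/1e6/ms gives GB/s.
    const float tflops   = static_cast<float>(flop) / 1.E9f / ave_time_ms;
    const float gb_per_s = static_cast<float>(num_bytes) / 1.E6f / ave_time_ms;

    std::printf("Perf: %f ms, %f TFlops, %f GB/s\n", ave_time_ms, tflops, gb_per_s);
}

int main()
{
    // Default problem sizes from run_grouped_gemm_example(): 16 groups with
    // M = 256 + 256*i, N = 128 + 128*i, K = 128 + 64*i.
    std::vector<GroupShape> groups;
    for(std::size_t i = 0; i < 16; ++i)
    {
        groups.push_back({256 + 256 * i, 128 + 128 * i, 128 + 64 * i});
    }

    report_perf(groups, /*ave_time_ms=*/0.5f, /*elem_bytes=*/2); // 2 bytes per fp16 element
    return 0;
}
```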
+add_dependencies(example_gemm_reduce_xdl_max + example_gemm_max_xdl_bf16 + example_gemm_max_xdl_fp16 + example_gemm_max_xdl_fp32 + example_gemm_max_xdl_int8) + +add_dependencies(example_gemm_reduce_xdl_mean_meansquare + example_gemm_mean_meansquare_xdl_fp16 + example_gemm_mean_meansquare_xdl_fp32 + example_gemm_mean_meansquare_xdl_bf16 + example_gemm_add_addsquare_xdl_int8) + +add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16) + +add_dependencies(example_gemm_reduce_xdl + example_gemm_reduce_xdl_mean_meansquare + example_gemm_reduce_xdl_max + example_gemm_add_add_mean_meansquare_xdl) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp) + add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4) +endif() diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp new file mode 100644 index 00000000..eb3832a6 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +// DataType +using ADataType = F16; +using BDataType = F16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = F16; +using D1DataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using R1DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using D1Layout = Row; +using ELayout = D1Layout; + +// Elementwise op +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAdd = ck::tensor_operation::element_wise::AddAdd; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddAdd; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using 
DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle +//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer| +//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector| +//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| | + < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +template +void DumpPerf(float ave_time, int M, int N, int K) +{ + std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N; + std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(D0DataType) * M * N + sizeof(D1DataType) * M * N + + sizeof(EDataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec + << " GB/s, " << std::endl; +} + +auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); +}; + +auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + +int main() +{ + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = 1024; + ck::index_t StrideE = 1024; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor d0_n(f_host_tensor_descriptor1d(N, 1)); + 
Tensor d1_m_n(f_host_tensor_descriptor2d(M, N, StrideD1, D1Layout{})); + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor r0_m(f_host_tensor_descriptor1d(M, 1)); + Tensor r1_m(f_host_tensor_descriptor1d(M, 1)); + + a_m_k.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + d0_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + d1_m_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_device_buf(sizeof(D0DataType) * d0_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize()); + DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize()); + DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d0_device_buf.ToDevice(d0_n.mData.data()); + d1_device_buf.ToDevice(d1_m_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + auto qs_element_op = QsElementOp{}; + auto rs_element_op = RsElementOp{N, N}; + + // Prepare GEMM, mean, mean_square + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + {r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()}, + M, + N, + K, + StrideA, + StrideB, + {StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error("wrong! 
this device_op instance does not support this problem"); + } + + // init reducetion buffer to 0 + r0_device_buf.SetZero(); + r1_device_buf.SetZero(); + + invoker.Run(argument, StreamConfig{nullptr, false}); + + bool do_verification = true; + bool pass = true; + + if(do_verification) + { + auto I0 = ck::Number<0>{}; + auto I1 = ck::Number<1>{}; + + Tensor e_m_n_host(e_m_n.mDesc); + Tensor r0_m_host(r0_m.mDesc); + Tensor r1_m_host(r1_m.mDesc); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + auto reduce0_op = R0ThreadReduceOp{}; + auto reduce1_op = R1ThreadReduceOp{}; + + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + auto reduce1_acc = reduce1_op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + ReduceAccDataType square_e_val; + + auto e_val = ck::type_convert(e_m_n_host(m, n)); + auto d0_val = ck::type_convert(d0_n(n)); + auto d1_val = ck::type_convert(d1_m_n(m, n)); + cde_element_op(e_val, e_val, d0_val, d1_val); + e_m_n_host(m, n) = ck::type_convert(e_val); + + auto e_val_reduce = ck::type_convert(e_val); + qs_element_op[I1](square_e_val, e_val_reduce); + + reduce0_op(reduce0_acc, e_val_reduce); + reduce1_op(reduce1_acc, square_e_val); + } + + rs_element_op[I0](reduce0_acc, reduce0_acc); + rs_element_op[I1](reduce1_acc, reduce1_acc); + r0_m_host(m) = ck::type_convert(reduce0_acc); + r1_m_host(m) = ck::type_convert(reduce1_acc); + } + + e_device_buf.FromDevice(e_m_n.mData.data()); + r0_device_buf.FromDevice(r0_m.mData.data()); + r1_device_buf.FromDevice(r1_m.mData.data()); + + pass = ck::utils::check_err(e_m_n, e_m_n_host, "Error: Incorrect results c", 1e-2, 1e-2); + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r1_m, r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2); + } + + bool time_kernel = true; + if(time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + DumpPerf( + ave_time, M, N, K); + } + + return pass ? 0 : 1; +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp new file mode 100644 index 00000000..e1248002 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_addsquare_xdl_int8.cpp @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
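The verification loop in the fp16 example above reproduces the fused epilogue on the host. As a minimal plain-C++ sketch of the same math (illustrative names only, not CK API): per row, E is the GEMM result plus the broadcast bias and the residual, and the two reductions are the row mean and row mean-of-squares, with the division by N supplied by the UnaryDivide element op constructed as RsElementOp{N, N}.

```cpp
#include <cstddef>
#include <vector>

// Minimal host-side sketch (plain C++, illustrative names only -- not CK API) of
// what the fused kernel above produces for each row m:
//   e(m,n) = gemm(m,n) + d0(n) + d1(m,n)   // AddAdd epilogue, d0 broadcast over rows
//   r0(m)  = (1/N) * sum_n e(m,n)          // mean
//   r1(m)  = (1/N) * sum_n e(m,n)^2        // mean of squares
void reference_mean_meansquare(const std::vector<float>& gemm, // M*N, row-major
                               const std::vector<float>& d0,   // N, broadcast over rows
                               const std::vector<float>& d1,   // M*N, row-major
                               std::vector<float>& e,          // M*N, output
                               std::vector<float>& r0,         // M
                               std::vector<float>& r1,         // M
                               std::size_t M, std::size_t N)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        float sum = 0.f, sum_sq = 0.f;
        for(std::size_t n = 0; n < N; ++n)
        {
            const float v = gemm[m * N + n] + d0[n] + d1[m * N + n];
            e[m * N + n]  = v;
            sum += v;
            sum_sq += v * v;
        }
        r0[m] = sum / static_cast<float>(N);
        r1[m] = sum_sq / static_cast<float>(N);
    }
}
```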
+ +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = INT8; +using BDataType = INT8; +using GemmAccDataType = INT32; +using CShuffleDataType = INT32; +using DsDataType = ck::Tuple<>; +using EDataType = INT8; +using ReduceAccDataType = INT32; +using R0DataType = INT32; +using R1DataType = INT32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using Square = ck::tensor_operation::element_wise::UnarySquare; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 16, // ABlockTransfer SrcScalarPerVector + 16, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 16, // BBlockTransfer SrcScalarPerVector + 16, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +using namespace ck::literals; + +template +bool run_gemm_reduce_add_addsquare_xdl(ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideE, + bool do_verification, + int init_method, + bool time_kernel) +{ + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor r0_m(f_host_tensor_descriptor1d(M, 1)); + Tensor r1_m(f_host_tensor_descriptor1d(M, 1)); + + switch(init_method) 
+ { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + default: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize()); + DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize()); + DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + auto qs_element_op = QsElementOp{}; + auto rs_element_op = RsElementOp{}; + + // Prepare GEMM, add, add_square + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + {r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()}, + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error("wrong! this device_op instance does not support this problem"); + } + + // init reducetion buffer to 0 + r0_device_buf.SetZero(); + r1_device_buf.SetZero(); + + invoker.Run(argument, StreamConfig{nullptr, false}); + + bool pass = true; + + if(do_verification) + { + auto I0 = ck::Number<0>{}; + auto I1 = ck::Number<1>{}; + + Tensor e_m_n_host(e_m_n.mDesc); + Tensor r0_m_host(r0_m.mDesc); + Tensor r1_m_host(r1_m.mDesc); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + auto reduce0_op = RsThreadReduceOp{}[I0]; + auto reduce1_op = RsThreadReduceOp{}[I1]; + + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.template GetIdentityValue(); + auto reduce1_acc = reduce1_op.template GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + ReduceAccDataType square_e_val; + auto e_val = ck::type_convert(e_m_n_host(m, n)); + qs_element_op[I1](square_e_val, e_val); + + reduce0_op(reduce0_acc, e_val); + reduce1_op(reduce1_acc, square_e_val); + } + + r0_m_host(m) = ck::type_convert(reduce0_acc); + r1_m_host(m) = ck::type_convert(reduce1_acc); + } + e_device_buf.FromDevice(e_m_n.mData.data()); + + Tensor e_m_n_host_converted(e_m_n_host); + + pass = ck::utils::check_err( + e_m_n, e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); + + r0_device_buf.FromDevice(r0_m.mData.data()); + r1_device_buf.FromDevice(r1_m.mData.data()); + + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r1_m, r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2); + + if(pass) + { + std::cout << "Success!" 
<< std::endl; + } + } + + if(time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + std::size_t flop = 2_uz * M * N * K + 3_uz * M * N; + std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(EDataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec + << " GB/s, " << std::endl; + } + + return pass; +} + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1152; + ck::index_t K = 512; + + ck::index_t StrideA = 512; + ck::index_t StrideB = 512; + ck::index_t StrideE = 1152; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return !run_gemm_reduce_add_addsquare_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp new file mode 100644 index 00000000..c2feffeb --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_bf16.cpp @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
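The gemm_max_* examples that follow swap the additive reductions for a per-row max. A minimal host-side sketch of that reduction (plain C++, not the CK API) is below; the key difference from the mean/mean-square path is the starting value: a max reduction must begin at the identity of max, which is why the shared runner later in this patch calls r0_device_buf.SetValue(ck::NumericLimits<R0DataType>::Lowest()) instead of SetZero().

```cpp
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// Per-row max over an M x N row-major matrix, mirroring what the gemm_max
// examples verify on the host. The accumulator starts at the identity element
// of max (the lowest representable value), not at zero.
void rowwise_max(const std::vector<float>& e, std::size_t M, std::size_t N,
                 std::vector<float>& r0)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        float acc = std::numeric_limits<float>::lowest(); // identity value for max
        for(std::size_t n = 0; n < N; ++n)
            acc = std::max(acc, e[m * N + n]);
        r0[m] = acc;
    }
}
```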
+ +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = BF16; +using BDataType = BF16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using RsThreadReduceOp = ck::Tuple; +using RsGlobalReduceOp = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 8, // ABlockTransfer SrcScalarPerVector + 8, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 8, // BBlockTransfer SrcScalarPerVector + 8, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1152; + ck::index_t K = 256; + + ck::index_t StrideA = 256; + ck::index_t StrideB = 256; + ck::index_t StrideE = 1152; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return run_gemm_reduce_max_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp new file mode 100644 index 00000000..363390ad 
--- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp16.cpp @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = F16; +using BDataType = F16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using RsThreadReduceOp = ck::Tuple; +using RsGlobalReduceOp = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 8, // ABlockTransfer SrcScalarPerVector + 8, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 8, // BBlockTransfer SrcScalarPerVector + 8, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideE = 1024; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return run_gemm_reduce_max_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, 
init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp new file mode 100644 index 00000000..de6b7eb4 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_fp32.cpp @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = F32; +using BDataType = F32; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; +using ReduceAccDataType = F32; +using R0DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using RsThreadReduceOp = ck::Tuple; +using RsGlobalReduceOp = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 4, // ABlockTransfer SrcScalarPerVector + 4, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 4, // BBlockTransfer SrcScalarPerVector + 4, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideE = 1024; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: Measure kernel execution time (1=ON, 
0=Off)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n"); + exit(0); + } + + return run_gemm_reduce_max_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp new file mode 100644 index 00000000..9666fc66 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int4.cpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = INT4; +using ADataKernelType = INT8; +using BDataType = INT4; +using BDataKernelType = INT8; +using GemmAccDataType = INT32; +using CShuffleDataType = INT32; +using DsDataType = ck::Tuple<>; +using EDataType = INT4; +using EDataKernelType = INT8; +using ReduceAccDataType = INT32; +using R0DataType = INT32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using RsThreadReduceOp = ck::Tuple; +using RsGlobalReduceOp = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 16, // ABlockTransfer SrcScalarPerVector + 16, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 16, // BBlockTransfer SrcScalarPerVector + 16, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1152; + ck::index_t K = 256; + + ck::index_t StrideA = 256; + ck::index_t StrideB = 256; + ck::index_t StrideE = 1152; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + 
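+        // Strides are leading dimensions in elements: row pitch for the row-major
+        // A and E matrices, column pitch for the column-major B matrix.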
StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return run_gemm_reduce_max_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp new file mode 100644 index 00000000..00e0b767 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_max_xdl_int8.cpp @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = INT8; +using BDataType = INT8; +using GemmAccDataType = INT32; +using CShuffleDataType = INT32; +using DsDataType = ck::Tuple<>; +using EDataType = INT8; +using ReduceAccDataType = INT32; +using R0DataType = INT32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using RsThreadReduceOp = ck::Tuple; +using RsGlobalReduceOp = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 16, // ABlockTransfer SrcScalarPerVector + 16, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 16, // BBlockTransfer SrcScalarPerVector + 16, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1152; + ck::index_t K = 512; + + ck::index_t StrideA = 512; + ck::index_t StrideB = 512; + ck::index_t StrideE = 1152; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else 
if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return run_gemm_reduce_max_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp new file mode 100644 index 00000000..652c0e6e --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_bf16.cpp @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = BF16; +using BDataType = BF16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using R1DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 8, // ABlockTransfer SrcScalarPerVector + 8, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 8, // BBlockTransfer SrcScalarPerVector + 8, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer 
ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1152; + ck::index_t K = 192; + + ck::index_t StrideA = 192; + ck::index_t StrideB = 192; + ck::index_t StrideE = 1152; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return !run_gemm_reduce_mean_meansquare_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp new file mode 100644 index 00000000..7eee24fe --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp16.cpp @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
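Every example in this group prints its Perf line with the same arithmetic: ave_time comes back from the invoker in milliseconds, so flop / 1e9 / ave_time equals flop / 1e12 divided by seconds (TFLOPS), and bytes / 1e6 / ave_time gives GB/s. A small standalone sketch of that calculation (an illustrative helper, not part of CK):

```cpp
#include <cstddef>
#include <iostream>

// Perf arithmetic used throughout these examples. ave_time_ms is in
// milliseconds: flop / 1e9 / ms == flop / 1e12 / s (TFLOPS), and
// bytes / 1e6 / ms == bytes / 1e9 / s (GB/s).
void dump_perf(float ave_time_ms, std::size_t flop, std::size_t num_bytes)
{
    const float tflops     = static_cast<float>(flop) / 1.E9f / ave_time_ms;
    const float gb_per_sec = static_cast<float>(num_bytes) / 1.E6f / ave_time_ms;

    std::cout << "Perf: " << ave_time_ms << " ms, " << tflops << " TFlops, "
              << gb_per_sec << " GB/s" << std::endl;
}

// For the mean/mean-square runs the FLOP count is 2*M*N*K + M*(3*N + 2):
// roughly N adds for the sum, N multiplies plus N adds for the sum of squares,
// and 2 divisions per row, on top of the 2*M*N*K GEMM.
```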
+ +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = F16; +using BDataType = F16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using R1DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 8, // ABlockTransfer SrcScalarPerVector + 8, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 8, // BBlockTransfer SrcScalarPerVector + 8, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideE = 1024; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 
9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return !run_gemm_reduce_mean_meansquare_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp new file mode 100644 index 00000000..c250b996 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_mean_meansquare_xdl_fp32.cpp @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_reduce_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// DataType +using ADataType = F32; +using BDataType = F32; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; +using ReduceAccDataType = F32; +using R0DataType = F32; +using R1DataType = F32; +using RsDataType = ck::Tuple; + +// Layout +using ALayout = Row; +using BLayout = Col; +using ELayout = Row; + +// Elementwise op +using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 4, // ABlockTransfer SrcScalarPerVector + 4, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 4, // BBlockTransfer SrcScalarPerVector + 4, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<64, 4>, // CD Reduce Thread Transfer ClusterLengths _MPerBlock_NPerBlock + 4, // CDE ReduceThreadTransfer ScalarPerVector _NPerBlock + 1>; // RThread DstScalarPerVector _MPerBlock +// clang-format on +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideE = 1024; + + if(argc == 1) + { + // do nothing + } + else 
if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideE = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << " arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << " arg3: Measure kernel execution time (1=ON, 0=Off)\n" + << " arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideE\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return !run_gemm_reduce_mean_meansquare_xdl( + M, N, K, StrideA, StrideB, StrideE, do_verification, init_method, time_kernel); +} diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp new file mode 100644 index 00000000..62992de5 --- /dev/null +++ b/example/16_gemm_multi_d_multi_reduces/gemm_reduce_xdl_common.hpp @@ -0,0 +1,491 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include + +#include "ck/ck.hpp" +#include "ck/host_utility/io.hpp" +#include "ck/stream_config.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using F32 = float; +using F64 = double; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using INT4 = ck::int4_t; +#endif +using INT8 = std::int8_t; +using INT32 = std::int32_t; + +template +void DumpGemmReduceMaxPerf(float ave_time, int M, int N, int K) +{ + using namespace ck::literals; + + std::size_t flop = 2_uz * M * N * K; + std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(EDataType) * M * N + sizeof(R0DataType) * M; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec + << " GB/s, " << std::endl; +} + +template +void DumpGemmReduceMeanSquareMeanPerf(float ave_time, int M, int N, int K) +{ + using namespace ck::literals; + + std::size_t flop = 2_uz * M * N * K + M * (3_uz * N + 2_uz); + std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(EDataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gemm_gb_per_sec = gemm_num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gemm_gb_per_sec + << " GB/s, " << std::endl; +} + +template +auto run_gemm_reduce_max_xdl(ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideE, + bool do_verification, 
+ int init_method, + bool time_kernel) +{ +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); + static_assert(sizeof(ADataType) == sizeof(ADataKernelType)); + static_assert(sizeof(BDataType) == sizeof(BDataKernelType)); + static_assert(sizeof(EDataType) == sizeof(EDataKernelType)); +#endif + using namespace ck::literals; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor r0_m(f_host_tensor_descriptor1d(M, 1)); + + switch(init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + default: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + break; + } + + DeviceMem a_device_buf(sizeof(ADataKernelType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataKernelType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataKernelType) * e_m_n.mDesc.GetElementSpaceSize()); + DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize()); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if constexpr(std::is_same_v) + { + Tensor a_m_k_converted = a_m_k.template CopyAsType(); + Tensor b_k_n_converted = b_k_n.template CopyAsType(); + + a_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_device_buf.ToDevice(b_k_n_converted.mData.data()); + } + else +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + { + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + } + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + auto qs_element_op = QsElementOp{}; + auto rs_element_op = RsElementOp{}; + + // Prepare GEMM, max + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + {r0_device_buf.GetDeviceBuffer()}, + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error("wrong! this device_op instance does not support this problem"); + } + + // [CAUTION]: launch_and_time_kernel will not initialize D. + // If we evaluate kernel multiple time but without initialize D. 
Verification will fail + r0_device_buf.SetValue(ck::NumericLimits::Lowest()); + + invoker.Run(argument, StreamConfig{nullptr, false}); + + bool pass = true; + + if(do_verification) + { + auto I0 = ck::Number<0>{}; + + Tensor e_m_n_host(e_m_n.mDesc); + Tensor r0_m_host(r0_m.mDesc); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, cde_element_op); + + ref_invoker.Run(ref_argument); + + auto reduce0_op = RsThreadReduceOp{}[I0]; + + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.template GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + auto e_val = e_m_n_host(m, n); + reduce0_op(reduce0_acc, e_val); + }; + + r0_m_host(m) = ck::type_convert(reduce0_acc); + } + + e_device_buf.FromDevice(e_m_n.mData.data()); + Tensor e_m_n_host_converted(e_m_n_host); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if constexpr(std::is_same_v) + { + Tensor e_m_n_device_converted(e_m_n); + pass = ck::utils::check_err(e_m_n_device_converted, + e_m_n_host_converted, + "Error: Incorrect results c", + 1e-2, + 1e-2); + } + else +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + { + pass = ck::utils::check_err( + e_m_n, e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); + } + + r0_device_buf.FromDevice(r0_m.mData.data()); + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + + if(pass) + { + std::cout << "Success!" << std::endl; + } + } + + if(time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + DumpGemmReduceMaxPerf(ave_time, M, N, K); + } + + return pass ? 0 : 1; +} + +template +bool run_gemm_reduce_mean_meansquare_xdl(ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideE, + bool do_verification, + int init_method, + bool time_kernel) +{ +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); + static_assert(sizeof(ADataType) == sizeof(ADataKernelType)); + static_assert(sizeof(BDataType) == sizeof(BDataKernelType)); + static_assert(sizeof(EDataType) == sizeof(EDataKernelType)); +#endif + using namespace ck::literals; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor r0_m(f_host_tensor_descriptor1d(M, 1)); + Tensor r1_m(f_host_tensor_descriptor1d(M, 1)); + + switch(init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(a_m_k); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(b_k_n); + break; + default: + ck::utils::FillUniformDistribution{-1.f, 1.f}(a_m_k); + ck::utils::FillUniformDistribution{-1.f, 1.f}(b_k_n); + break; + } + + DeviceMem a_device_buf(sizeof(ADataKernelType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataKernelType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem 
e_device_buf(sizeof(EDataKernelType) * e_m_n.mDesc.GetElementSpaceSize()); + DeviceMem r0_device_buf(sizeof(R0DataType) * r0_m.mDesc.GetElementSpaceSize()); + DeviceMem r1_device_buf(sizeof(R1DataType) * r1_m.mDesc.GetElementSpaceSize()); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if constexpr(std::is_same_v) + { + Tensor a_m_k_converted = a_m_k.template CopyAsType(); + Tensor b_k_n_converted = b_k_n.template CopyAsType(); + + a_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_device_buf.ToDevice(b_k_n_converted.mData.data()); + } + else +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + { + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + } + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + auto qs_element_op = QsElementOp{}; + auto rs_element_op = RsElementOp{N, N}; + + // Prepare GEMM, mean, mean_square + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = + device_op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + {r0_device_buf.GetDeviceBuffer(), r1_device_buf.GetDeviceBuffer()}, + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error("wrong! this device_op instance does not support this problem"); + } + + // init reducetion buffer to 0 + r0_device_buf.SetZero(); + r1_device_buf.SetZero(); + + invoker.Run(argument, StreamConfig{nullptr, false}); + + bool pass = true; + + if(do_verification) + { + auto I0 = ck::Number<0>{}; + auto I1 = ck::Number<1>{}; + + Tensor e_m_n_host(e_m_n.mDesc); + Tensor r0_m_host(r0_m.mDesc); + Tensor r1_m_host(r1_m.mDesc); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, e_m_n_host, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + auto reduce0_op = RsThreadReduceOp{}[I0]; + auto reduce1_op = RsThreadReduceOp{}[I1]; + + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.template GetIdentityValue(); + auto reduce1_acc = reduce1_op.template GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + ReduceAccDataType square_e_val; + auto e_val = ck::type_convert(e_m_n_host(m, n)); + qs_element_op[I1](square_e_val, e_val); + + reduce0_op(reduce0_acc, e_val); + reduce1_op(reduce1_acc, square_e_val); + } + + rs_element_op[I0](reduce0_acc, reduce0_acc); + rs_element_op[I1](reduce1_acc, reduce1_acc); + r0_m_host(m) = ck::type_convert(reduce0_acc); + r1_m_host(m) = ck::type_convert(reduce1_acc); + } + e_device_buf.FromDevice(e_m_n.mData.data()); + Tensor e_m_n_host_converted(e_m_n_host); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if constexpr(std::is_same_v) + { + Tensor e_m_n_device_converted(e_m_n); + pass = ck::utils::check_err(e_m_n_device_converted, + e_m_n_host_converted, + "Error: Incorrect results c", + 1e-2, + 1e-2); + } + else +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + { + pass = ck::utils::check_err( + e_m_n, e_m_n_host_converted, "Error: Incorrect results c", 1e-2, 1e-2); + } + + r0_device_buf.FromDevice(r0_m.mData.data()); + r1_device_buf.FromDevice(r1_m.mData.data()); + + pass &= ck::utils::check_err(r0_m, r0_m_host, "Error: Incorrect results d0", 1e-2, 1e-2); + pass &= ck::utils::check_err(r1_m, 
r1_m_host, "Error: Incorrect results d1", 1e-2, 1e-2);
+
+        if(pass)
+        {
+            std::cout << "Success!" << std::endl;
+        }
+    }
+
+    if(time_kernel)
+    {
+        float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+        DumpGemmReduceMeanSquareMeanPerf(
+            ave_time, M, N, K);
+    }
+
+    return pass;
+}
diff --git a/example/17_convnd_bwd_data/CMakeLists.txt b/example/17_convnd_bwd_data/CMakeLists.txt
new file mode 100644
index 00000000..fa4e65d9
--- /dev/null
+++ b/example/17_convnd_bwd_data/CMakeLists.txt
@@ -0,0 +1,5 @@
+add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp)
+target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility)
+
+add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp)
+target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility)
diff --git a/example/17_convnd_bwd_data/README.md b/example/17_convnd_bwd_data/README.md
new file mode 100644
index 00000000..b5c8281e
--- /dev/null
+++ b/example/17_convnd_bwd_data/README.md
@@ -0,0 +1,47 @@
+# Instructions for ```example_convnd_bwd_data_xdl```
+
+## Run ```example_convnd_bwd_data_xdl```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: run kernel # of times (>1)
+#arg4: num_dim_spatial(1|2|3)
+#arg5 to ...: N, K, C, [Z,] [Y,] X, [Di,] [Hi,] Wi, [Sz,] [Sy,] Sx, [Dz,] [Dy,] Dx, [LeftPz,] [LeftPy,] LeftPx, [RightPz,] [RightPy,] RightPx
+./bin/example_convnd_bwd_data_xdl 0 1 5
+```
+
+Result
+```
+in_n_c_hi_wi: dim 4, lengths {128, 128, 71, 71}, strides {645248, 1, 9088, 128}
+wei_k_c_y_x: dim 4, lengths {256, 128, 3, 3}, strides {1152, 1, 384, 128}
+out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
+arg.a_grid_desc_k0_m_k1_container_{128, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{128, 128, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 128}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 2, 2, 2, 4, 2 )
+launch_and_time_kernel: grid_dim {1369, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+arg.a_grid_desc_k0_m_k1_container_{64, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{64, 128, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 128}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 2, 2, 2, 4, 2 )
+launch_and_time_kernel: grid_dim {1369, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+arg.a_grid_desc_k0_m_k1_container_{64, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{64, 128, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 128}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 2, 2, 2, 4, 2 )
+launch_and_time_kernel: grid_dim {1369, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+arg.a_grid_desc_k0_m_k1_container_{32, 175232, 8}
+arg.b_grid_desc_k0_n_k1_container_{32, 128, 8}
+arg.c_grid_desc_m_n_container_{ 175232, 128}
+arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( 2738, 2, 2, 2, 4, 2 )
+launch_and_time_kernel: grid_dim {1369, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 1 times...
+Perf: 1.40031 ms, 69.8734 TFlops, 179.037 GB/s
+```
diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
new file mode 100644
index 00000000..26fa9e98
--- /dev/null
+++ b/example/17_convnd_bwd_data/convnd_bwd_data_common.hpp
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
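+
+// Shared harness used by the convnd_bwd_data_xdl_fp16 and convnd_bwd_data_dl_fp16
+// examples below: it fills the weight and output host tensors, copies them to
+// device memory, zeroes the input device buffer (the backward-data kernel writes
+// the recovered input there), runs the DeviceConvNdBwdDataInstance chosen by the
+// caller, prints the measured TFlops and GB/s, and optionally compares the device
+// result against the CPU ReferenceConvBwdData implementation.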
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" + +void print_helper_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +template +int run_conv_bwd_data(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + Tensor in_host(in_g_n_c_wis_desc); + Tensor in_device(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out(out_g_n_k_wos_desc); + + std::cout << "in: " << in_host.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + out.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + out.GenerateTensorValue(GeneratorTensor_1{1}); + wei.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); + + out_device_buf.ToDevice(out.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + // reset input to zero + in_device_buf.SetZero(); + + // do GEMM + auto conv = DeviceConvNdBwdDataInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.N_, + conv_param.K_, + conv_param.C_, + conv_param.input_spatial_lengths_, + conv_param.filter_spatial_lengths_, + conv_param.GetOutputSpatialLengths(), + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + std::cout << "Not support,please check parameters or device"; + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << 
std::endl; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData(); + + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(in_host, + wei, + out, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + in_device_buf.FromDevice(in_device.mData.data()); + + return ck::utils::check_err(in_device, in_host) ? 0 : 1; + } + + return 0; +} diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp new file mode 100644 index 00000000..f0896e97 --- /dev/null +++ b/example/17_convnd_bwd_data/convnd_bwd_data_dl_fp16.cpp @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_bwd_data_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +template +// clang-format off +using DeviceConvNdBwdDataInstance = ck::tensor_operation::device::DeviceConvNdBwdDataNwcKxcNwk_Dl< +// ######| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| Convolution| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| +// ######| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| +// ######| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + NDimSpatial, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 1, 8, 2>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>; +// clang-format on + +int main(int argc, char* argv[]) +{ + namespace ctc = 
ck::tensor_layout::convolution; + + print_helper_msg(); + + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::utils::conv::ConvParam conv_param{ + 2, 1, 128, 256, 256, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(conv_param.num_dim_spatial_ == 1) + { + using InLayout = ctc::GNWC; + using WeiLayout = ctc::GKXC; + using OutLayout = ctc::GNWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_conv_bwd_data<1, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceConvNdBwdDataInstance<1>>(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + else if(conv_param.num_dim_spatial_ == 2) + { + using InLayout = ctc::GNHWC; + using WeiLayout = ctc::GKYXC; + using OutLayout = ctc::GNHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_conv_bwd_data<2, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceConvNdBwdDataInstance<2>>(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + else if(conv_param.num_dim_spatial_ == 3) + { + using InLayout = ctc::GNDHWC; + using WeiLayout = ctc::GKZYXC; + using OutLayout = ctc::GNDHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_conv_bwd_data<3, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceConvNdBwdDataInstance<3>>(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + + return 0; +} diff --git a/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp new file mode 100644 index 
00000000..c4f2c1f0 --- /dev/null +++ b/example/17_convnd_bwd_data/convnd_bwd_data_xdl_fp16.cpp @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_bwd_data_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +template +using DeviceConvNdBwdDataInstance = ck::tensor_operation::device::DeviceConvNdBwdDataNwcKxcNwk_Xdl< + NDimSpatial, // NDimSpatial + InDataType, // InDataType + WeiDataType, // WeiDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + InElementOp, // InElementwiseOperation + WeiElementOp, // WeiElementwiseOperation + OutElementOp, // OutElementwiseOperation + ConvBwdDefault, // ConvolutionBackwardDataSpecialization + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<2, 0, 1>, // BBlockTransferThreadClusterArrangeOrder + S<0, 2, 1>, // BBlockTransferSrcAccessOrder + 1, // BBlockTransferSrcVectorDim + 2, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 7, + 1>; // GemmCThreadTransferDstScalarPerVector + +int main(int argc, char* argv[]) +{ + namespace ctc = ck::tensor_layout::convolution; + + print_helper_msg(); + + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::utils::conv::ConvParam conv_param{ + 2, 1, 128, 256, 256, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(conv_param.num_dim_spatial_ == 1) + { + using InLayout = ctc::GNWC; + using WeiLayout = ctc::GKXC; + using OutLayout = ctc::GNWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + 
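+        // 1D case: hand the GNWC/GKXC/GNWK host descriptors built above to the
+        // shared harness together with the 1-spatial-dim XDL device instance.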
+ return run_conv_bwd_data<1, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceConvNdBwdDataInstance<1>>(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + else if(conv_param.num_dim_spatial_ == 2) + { + using InLayout = ctc::GNHWC; + using WeiLayout = ctc::GKYXC; + using OutLayout = ctc::GNHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_conv_bwd_data<2, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceConvNdBwdDataInstance<2>>(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + else if(conv_param.num_dim_spatial_ == 3) + { + using InLayout = ctc::GNDHWC; + using WeiLayout = ctc::GKZYXC; + using OutLayout = ctc::GNDHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_conv_bwd_data<3, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceConvNdBwdDataInstance<3>>(do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); + } + + return 0; +} diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/example/18_batched_gemm_reduce/CMakeLists.txt new file mode 100644 index 00000000..99fc0043 --- /dev/null +++ b/example/18_batched_gemm_reduce/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_batched_gemm_reduce_xdl_fp16 batched_gemm_reduce_xdl_fp16.cpp) + diff --git a/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp new file mode 100644 index 00000000..c2e3602a --- /dev/null +++ b/example/18_batched_gemm_reduce/batched_gemm_reduce_xdl_fp16.cpp @@ -0,0 +1,311 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
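+
+// This example runs a batched GEMM whose epilogue also produces two per-row
+// reductions over N: D0(g, m) = sum_n C(g, m, n) and D1(g, m) = sum_n C(g, m, n)^2.
+// The reduction results are accumulated into global memory with atomic adds,
+// which is why the D0/D1 buffers are zeroed before the launch and why the timed
+// multi-run path cannot be used for the correctness check.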
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; +using ReduceAccDataType = F32; +using ReduceDataType = F32; +using ReducePtrsGlobal = ck::Tuple; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CElementOp = ck::tensor_operation::element_wise::PassThrough; +using ReduceOp0 = ck::reduce::Add; +using ReduceOp1 = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; +using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceGlobalMemOps = + ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmSpecialization = + ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle +//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| +//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | + < Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, AElementOp, BElementOp, CElementOp, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceGlobalMemOps, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>; +// clang-format on + +using ReferenceBatchedGemmInstance = + ck::tensor_operation::host::ReferenceBatchedGemm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 2048; + ck::index_t N = 1920; + ck::index_t K = 2048; + + ck::index_t StrideA = 2048; + ck::index_t StrideB = 2048; + ck::index_t StrideC = 1920; + + ck::index_t BatchCount = 4; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 11) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + + BatchCount = std::stoi(argv[10]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg4 to 10: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, BatchCount\n"); + exit(0); + } + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride}); + } + }; + + Tensor a_g_m_k(f_host_tensor_descriptor(BatchCount, M, K, StrideA, ALayout{})); + Tensor b_g_k_n(f_host_tensor_descriptor(BatchCount, K, N, StrideB, BLayout{})); + + Tensor c_g_m_n_host_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); + Tensor d0_g_m_host_result({BatchCount, M}); + Tensor d1_g_m_host_result({BatchCount, M}); + + Tensor c_g_m_n_device_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); + Tensor d0_g_m_device_result({BatchCount, M}); + Tensor d1_g_m_device_result({BatchCount, M}); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; + std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; + std::cout << "d0_g_m: " << d0_g_m_host_result.mDesc << std::endl; + std::cout << "d1_g_m: " << d1_g_m_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * 
c_g_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * + d0_g_m_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce1_device_buf(sizeof(ReduceDataType) * + d1_g_m_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_g_m_k.mData.data()); + b_device_buf.ToDevice(b_g_k_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + std::array gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op}; + + auto passthrough = UnaryIdenticElementOp{}; + auto square = UnarySquareElementOp{}; + std::array reduce_in_element_ops = {&passthrough, &square}; + std::array reduce_out_element_ops = {&passthrough, &passthrough}; + + std::array p_reduces = {reduce0_device_buf.GetDeviceBuffer(), + reduce1_device_buf.GetDeviceBuffer()}; + + // do GEMM + auto batched_gemm = DeviceBatchedGemmReduceInstance{}; + auto invoker = batched_gemm.MakeInvoker(); + auto argument = batched_gemm.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + nullptr, + {}, + c_device_buf.GetDeviceBuffer(), + p_reduces, + M, + N, + K, + StrideA, + StrideB, + StrideC, + {}, + gemm_element_ops, + {}, + reduce_in_element_ops, + reduce_out_element_ops, + BatchCount); + + if(!batched_gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + // init DO, D1 to 0 + reduce0_device_buf.SetZero(); + reduce1_device_buf.SetZero(); + + // if time_kernel == true, kernel will run multiple times. This kernel use atomic-add so result + // will not be correct. need to set time_kernel = false for correctness test + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * BatchCount * M * N * K; + std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K + + sizeof(BDataType) * BatchCount * K * N + + sizeof(CDataType) * BatchCount * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << batched_gemm.GetTypeString() << std::endl; + + bool pass = true; + if(do_verification) + { + c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); + reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); + reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); + + auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; + auto ref_invoker = ref_batched_gemm.MakeInvoker(); + + auto ref_argument = ref_batched_gemm.MakeArgument( + a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + auto reduce0_op = ReduceOp0{}; + auto reduce1_op = ReduceOp1{}; + + for(int batch = 0; batch < BatchCount; ++batch) + { + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + auto reduce1_acc = reduce1_op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + auto c_val = + ck::type_convert(c_g_m_n_host_result(batch, m, n)); + ReduceAccDataType d0_val; + ReduceAccDataType d1_val; + + UnaryIdenticElementOp{}(d0_val, c_val); + UnarySquareElementOp{}(d1_val, c_val); + reduce0_op(reduce0_acc, d0_val); + reduce1_op(reduce1_acc, d1_val); + } + + d0_g_m_host_result(batch, m) = ck::type_convert(reduce0_acc); + d1_g_m_host_result(batch, m) = 
ck::type_convert(reduce1_acc); + } + } + + pass = ck::utils::check_err( + c_g_m_n_host_result, c_g_m_n_device_result, "Error: Incorrect results c") && + ck::utils::check_err(d0_g_m_device_result, + d0_g_m_host_result, + "Error: Incorrect results! D0", + 1e-4, + 1e-5) && + ck::utils::check_err(d1_g_m_device_result, + d1_g_m_host_result, + "Error: Incorrect results! D1", + 1e-3, + 1e-5); + } + + return pass ? 0 : 1; +} diff --git a/example/19_binary_elementwise/CMakeLists.txt b/example/19_binary_elementwise/CMakeLists.txt new file mode 100644 index 00000000..39646e0a --- /dev/null +++ b/example/19_binary_elementwise/CMakeLists.txt @@ -0,0 +1,4 @@ +add_example_executable(example_broadcast_add_2d_amn_bn broadcast_add_2d_amn_bn.cpp) +add_example_executable(example_broadcast_add_3d_am_bmnk broadcast_add_3d_am_bmnk.cpp) +add_example_executable(example_elementwise_add_1d elementwise_add_1d.cpp) +add_example_executable(example_elementwise_add_4d elementwise_add_4d.cpp) \ No newline at end of file diff --git a/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp new file mode 100644 index 00000000..9eae27ca --- /dev/null +++ b/example/19_binary_elementwise/broadcast_add_2d_amn_bn.cpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; + +using Add = ck::tensor_operation::element_wise::Add; + +using DeviceElementwiseAddInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 2, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; + +template +void host_broadcast2D( + HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, int N, Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + auto Amn = A(m, n); + ctype Cmn = 0; + if constexpr(broadcastDim == 0) + { + auto Bn = B(n); + functor(Cmn, Amn, Bn); + } + else + { + auto Bm = B(m); + functor(Cmn, Amn, Bm); + } + C(m, n) = Cmn; + } + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t Stride = 1024; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + using namespace ck::literals; + + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + }; + + Tensor a_m_n(f_host_tensor_descriptor2d(M, N, Stride)); + Tensor b_n(f_host_tensor_descriptor1d(N, 1)); + Tensor c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); + + a_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_n_device_buf(sizeof(ABDataType) * a_m_n.mDesc.GetElementSpaceSize()); + DeviceMem b_n_device_buf(sizeof(ABDataType) * b_n.mDesc.GetElementSpaceSize()); + DeviceMem 
c_m_n_device_buf(sizeof(CDataType) * c_m_n.mDesc.GetElementSpaceSize()); + + a_m_n_device_buf.ToDevice(a_m_n.mData.data()); + b_n_device_buf.ToDevice(b_n.mData.data()); + + std::array input = {a_m_n_device_buf.GetDeviceBuffer(), + b_n_device_buf.GetDeviceBuffer()}; + std::array output = {c_m_n_device_buf.GetDeviceBuffer()}; + + std::array abc_lengths = {M, N}; + std::array a_strides = {Stride, 1}; + std::array b_strides = {0, 1}; + std::array c_strides = {Stride, 1}; + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer( + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_n_device_buf.FromDevice(c_m_n.mData.data()); + Tensor host_c_m_n(f_host_tensor_descriptor2d(M, N, Stride)); + + host_broadcast2D, Tensor, Tensor, Add, 0>( + host_c_m_n, a_m_n, b_n, M, N, Add{}); + + pass &= ck::utils::check_err(c_m_n, host_c_m_n, "Error: Incorrect results c", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp new file mode 100644 index 00000000..813d38b0 --- /dev/null +++ b/example/19_binary_elementwise/broadcast_add_3d_am_bmnk.cpp @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
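+
+// Broadcast add over a 3D shape: A is a 1D tensor indexed by m only, B is a full
+// (m, n, k) tensor, and C(m, n, k) = A(m) + B(m, n, k). The broadcast is expressed
+// purely through strides: A is passed to DeviceElementwise with strides {1, 0, 0},
+// so its value is reused along the n and k dimensions.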
+ +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; + +using Add = ck::tensor_operation::element_wise::Add; + +using DeviceElementwiseAddInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 3, + 8, + ck::Sequence<1, 8>, + ck::Sequence<8>>; + +template +void host_broadcast3D_am_bmnk(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t m = 0; m < shape[0]; ++m) + for(std::size_t n = 0; n < shape[1]; ++n) + for(std::size_t k = 0; k < shape[2]; ++k) + { + auto a_val = A(m); + auto b_val = B(m, n, k); + ctype c_val = 0; + functor(c_val, a_val, b_val); + C(m, n, k) = c_val; + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + std::vector mnk = {4, 16, 32}; + ck::index_t M = mnk[0]; + + Tensor a_m({M}); + Tensor b_m_n_k(mnk); + Tensor c_m_n_k(mnk); + + a_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_m_n_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpaceSize()); + DeviceMem b_m_n_k_device_buf(sizeof(ABDataType) * b_m_n_k.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_k_device_buf(sizeof(CDataType) * c_m_n_k.mDesc.GetElementSpaceSize()); + + a_m_device_buf.ToDevice(a_m.mData.data()); + b_m_n_k_device_buf.ToDevice(b_m_n_k.mData.data()); + + std::array input = {a_m_device_buf.GetDeviceBuffer(), + b_m_n_k_device_buf.GetDeviceBuffer()}; + std::array output = {c_m_n_k_device_buf.GetDeviceBuffer()}; + + std::array abc_lengths; + std::array a_strides = {1, 0, 0}; + std::array b_strides; + std::array c_strides; + + ck::ranges::copy(mnk, abc_lengths.begin()); + ck::ranges::copy(b_m_n_k.mDesc.GetStrides(), b_strides.begin()); + ck::ranges::copy(c_m_n_k.mDesc.GetStrides(), c_strides.begin()); + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer( + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_n_k_device_buf.FromDevice(c_m_n_k.mData.data()); + Tensor host_c_m_n_k(mnk); + + host_broadcast3D_am_bmnk, Tensor, Tensor, Add>( + host_c_m_n_k, a_m, b_m_n_k, mnk, Add{}); + + pass &= + ck::utils::check_err(c_m_n_k, host_c_m_n_k, "Error: Incorrect results c", 1e-3, 1e-3); + } + + return pass ? 
0 : 1; +} diff --git a/example/19_binary_elementwise/elementwise_add_1d.cpp b/example/19_binary_elementwise/elementwise_add_1d.cpp new file mode 100644 index 00000000..a1ca9378 --- /dev/null +++ b/example/19_binary_elementwise/elementwise_add_1d.cpp @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; + +using Add = ck::tensor_operation::element_wise::Add; + +using DeviceElementwiseAddInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 1, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; + +template +void host_elementwise1D( + HostTensorC& C, const HostTensorA& A, const HostTensorB& B, int M, Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(int m = 0; m < M; ++m) + { + auto Am = A(m); + auto Bm = B(m); + ctype Cm = 0; + functor(Cm, Am, Bm); + C(m) = Cm; + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + ck::index_t M = 1024; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + Tensor a_m(f_host_tensor_descriptor1d(M, 1)); + Tensor b_m(f_host_tensor_descriptor1d(M, 1)); + Tensor c_m(f_host_tensor_descriptor1d(M, 1)); + + a_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_m.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_m_device_buf(sizeof(ABDataType) * a_m.mDesc.GetElementSpaceSize()); + DeviceMem b_m_device_buf(sizeof(ABDataType) * b_m.mDesc.GetElementSpaceSize()); + DeviceMem c_m_device_buf(sizeof(CDataType) * c_m.mDesc.GetElementSpaceSize()); + + a_m_device_buf.ToDevice(a_m.mData.data()); + b_m_device_buf.ToDevice(b_m.mData.data()); + + std::array input = {a_m_device_buf.GetDeviceBuffer(), + b_m_device_buf.GetDeviceBuffer()}; + std::array output = {c_m_device_buf.GetDeviceBuffer()}; + + std::array abc_lengths = {M}; + std::array a_strides = {1}; + std::array b_strides = {1}; + std::array c_strides = {1}; + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer( + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_m_device_buf.FromDevice(c_m.mData.data()); + Tensor host_c_m(f_host_tensor_descriptor1d(M, 1)); + + host_elementwise1D, Tensor, Tensor, Add>( + host_c_m, a_m, b_m, M, Add{}); + + pass &= ck::utils::check_err(c_m, host_c_m, "Error: Incorrect results c", 1e-3, 1e-3); + } + + return pass ? 
0 : 1; +} diff --git a/example/19_binary_elementwise/elementwise_add_4d.cpp b/example/19_binary_elementwise/elementwise_add_4d.cpp new file mode 100644 index 00000000..27e10014 --- /dev/null +++ b/example/19_binary_elementwise/elementwise_add_4d.cpp @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ABDataType = F16; +using CDataType = F16; + +using Add = ck::tensor_operation::element_wise::Add; + +using DeviceElementwiseAddInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + Add, + 4, + 8, + ck::Sequence<8, 8>, + ck::Sequence<8>>; + +template +void host_elementwise4D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t n = 0; n < shape[0]; ++n) + for(std::size_t c = 0; c < shape[1]; ++c) + for(std::size_t h = 0; h < shape[2]; ++h) + for(std::size_t w = 0; w < shape[3]; ++w) + { + auto a_val = A(n, c, h, w); + auto b_val = B(n, c, h, w); + ctype c_val = 0; + functor(c_val, a_val, b_val); + C(n, c, h, w) = c_val; + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = false; + + std::vector nchw = {4, 16, 32, 32}; + + Tensor a(nchw); + Tensor b(nchw); + Tensor c(nchw); + + a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_device_buf(sizeof(ABDataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(ABDataType) * b.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a.mData.data()); + b_device_buf.ToDevice(b.mData.data()); + + std::array input = {a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer()}; + std::array output = {c_device_buf.GetDeviceBuffer()}; + + std::array abc_lengths; + std::array a_strides; + std::array b_strides; + std::array c_strides; + + ck::ranges::copy(nchw, abc_lengths.begin()); + ck::ranges::copy(a.mDesc.GetStrides(), a_strides.begin()); + ck::ranges::copy(b.mDesc.GetStrides(), b_strides.begin()); + ck::ranges::copy(c.mDesc.GetStrides(), c_strides.begin()); + + auto broadcastAdd = DeviceElementwiseAddInstance{}; + auto argument = broadcastAdd.MakeArgumentPointer( + abc_lengths, {a_strides, b_strides}, {c_strides}, input, output, Add{}); + + if(!broadcastAdd.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + + auto broadcastAdd_invoker_ptr = broadcastAdd.MakeInvokerPointer(); + float ave_time = + broadcastAdd_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + bool pass = true; + if(do_verification) + { + c_device_buf.FromDevice(c.mData.data()); + Tensor host_c(nchw); + + host_elementwise4D, Tensor, Tensor, Add>( + host_c, a, b, nchw, Add{}); + + pass &= ck::utils::check_err(c, 
host_c, "Error: Incorrect results c", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/example/20_grouped_conv_bwd_weight/CMakeLists.txt b/example/20_grouped_conv_bwd_weight/CMakeLists.txt new file mode 100644 index 00000000..557f7971 --- /dev/null +++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt @@ -0,0 +1,8 @@ +add_custom_target(example_grouped_conv_bwd_weight) + +add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp) +add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp) + + +add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16 + example_grouped_conv_bwd_weight_xdl_bf16) diff --git a/example/20_grouped_conv_bwd_weight/common.hpp b/example/20_grouped_conv_bwd_weight/common.hpp new file mode 100644 index 00000000..d2a8bed5 --- /dev/null +++ b/example/20_grouped_conv_bwd_weight/common.hpp @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" + +using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +template +struct CommonLayoutSetting +{ + using InputLayout = InputLay; + using WeightLayout = WeightLay; + using OutputLayout = OutputLay; +}; + +template +struct CommonLayoutSettingSelector; + +namespace ctl = ck::tensor_layout::convolution; + +template <> +struct CommonLayoutSettingSelector<1> final : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<2> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<3> final + : CommonLayoutSetting +{ +}; + +template +using InputLayout = typename CommonLayoutSettingSelector::InputLayout; + +template +using WeightLayout = typename CommonLayoutSettingSelector::WeightLayout; + +template +using OutputLayout = typename CommonLayoutSettingSelector::OutputLayout; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +#define DefaultConvParam \ + ck::utils::conv::ConvParam \ + { \ + 2, 4, 1, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, { 1, 1 } \ + } + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool 
parse_cmd_args(int argc, + char* argv[], + ExecutionConfig& config, + ck::utils::conv::ConvParam& conv_param) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam arguments + else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + conv_param = ck::utils::conv::parse_conv_param( + num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + print_help_msg(); + return false; + } + + return true; +} diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp new file mode 100644 index 00000000..9035309c --- /dev/null +++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_bf16.cpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = BF16; +// bf16 kernel use fp32 atomic add to accumulate Weight tensor into global memory +using WeiDataType = F32; +using OutDataType = BF16; +using AccDataType = F32; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = PassThrough; + +#include "run_grouped_conv_bwd_weight_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); } diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp new file mode 100644 index 00000000..6791b0bf --- /dev/null +++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_xdl_fp16.cpp @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = F16; +using WeiDataType = F16; +using OutDataType = F16; +using AccDataType = F32; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = PassThrough; + +#include "run_grouped_conv_bwd_weight_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_bwd_weight_example(argc, argv); } diff --git a/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc new file mode 100644 index 00000000..5264c856 --- /dev/null +++ b/example/20_grouped_conv_bwd_weight/run_grouped_conv_bwd_weight_example.inc @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
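+
+// Shared driver for the grouped conv backward-weight examples: it instantiates
+// DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle with a fixed split_k of 2,
+// zero-initializes the weight output buffer before the launch (the split-K kernel
+// accumulates partial results into it, so stale data would corrupt the gradient),
+// runs the kernel, and verifies the computed weight gradient against the CPU
+// ReferenceConvBwdWeight implementation.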
+ +template +using DeviceConvBwdWeightInstance = + ck::tensor_operation::device::DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< + NDimSpatial, // NDimSpatial + InDataType, // InDataType + WeiDataType, // WeiDataType + OutDataType, // OutDataType + AccDataType, // AccDataType + InElementOp, // InElementwiseOperation + WeiElementOp, // WeiElementwiseOperation + OutElementOp, // OutElementwiseOperation + ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization + 256, // BlockSize + 128, // MPerBlock + 128, // NPerBlock + 4, // K0PerBlock + 8, // K1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 2, // NXdlPerWave + S<1, 4, 16, 4>, // ABlockTransferThreadClusterLengths_K0_M_K1 + S<0, 3, 1, 2>, // ABlockTransferThreadClusterArrangeOrder + S<0, 2, 1, 3>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 2, // ABlockTransferDstScalarPerVector_K1 + true, // ABlockLdsAddExtraM + S<1, 4, 16, 4>, // BBlockTransferThreadClusterLengths_K0_N_K1 + S<0, 3, 1, 2>, // BBlockTransferThreadClusterArrangeOrder + S<0, 2, 1, 3>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 2, // BBlockTransferDstScalarPerVector_K1 + true, // BBlockLdsAddExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 4>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl + +template +using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight; + +template +bool run_grouped_conv_bwd_weight(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_param) +{ + constexpr ck::index_t split_k = 2; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed< + InputLayout>(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed< + WeightLayout>(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed< + OutputLayout>(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei_host_result(wei_g_k_c_xs_desc); + Tensor wei_device_result(wei_g_k_c_xs_desc); + Tensor out(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei_host_result.mDesc << std::endl; + std::cout << "out: " << out.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + out.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_device_result.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + out_device_buf.ToDevice(out.mData.data()); + + // init to 0 + wei_device_buf.SetZero(); + + std::array input_spatial_lengths{}; + std::array filter_spatial_lengths{}; + std::array output_spatial_lengths{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto range_copy = [](const auto& from, auto to) { 
std::copy(begin(from), end(from), to); }; + + range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths)); + range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths)); + range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths)); + range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides)); + range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations)); + range_copy(conv_param.input_left_pads_, begin(input_left_pads)); + range_copy(conv_param.input_right_pads_, begin(input_right_pads)); + + // do GEMM + auto conv = DeviceConvBwdWeightInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.C_, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}, + split_k); + + if(!conv.IsSupportedArgument(argument)) + { + std::cerr << "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem" + << std::endl; + return false; + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cerr << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl + << "DeviceOp: " << conv.GetTypeString() << std::endl; + + if(config.do_verification) + { + auto ref_conv = HostConvBwdWeightInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei_host_result, + out, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + + wei_device_buf.FromDevice(wei_device_result.mData.data()); + + return ck::utils::check_err(wei_device_result.mData, wei_host_result.mData); + } + + return true; +} + +bool run_grouped_conv_bwd_weight_example(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_param = DefaultConvParam; + + if(!parse_cmd_args(argc, argv, config, conv_param)) + { + return false; + } + + switch(conv_param.num_dim_spatial_) + { + case 1: return run_grouped_conv_bwd_weight<1>(config, conv_param); + case 2: return run_grouped_conv_bwd_weight<2>(config, conv_param); + case 3: return run_grouped_conv_bwd_weight<3>(config, conv_param); + } + + return false; +} diff --git a/example/21_gemm_layernorm/CMakeLists.txt b/example/21_gemm_layernorm/CMakeLists.txt new file mode 100644 index 00000000..78d3a5d0 --- /dev/null +++ b/example/21_gemm_layernorm/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_gemm_bias_relu_add_layernorm_xdl_fp16 gemm_bias_relu_add_layernorm_xdl_fp16.cpp) +add_example_executable(example_gemm_layernorm_xdl_fp16 gemm_layernorm_xdl_fp16.cpp) +add_example_executable(example_gemm_xdl_layernorm_single_kernel_fp16 gemm_xdl_layernorm_single_kernel_fp16.cpp) diff --git a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp 
b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp new file mode 100644 index 00000000..e37555e7 --- /dev/null +++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_fp16.cpp @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +// DataType +using ADataType = F16; +using BDataType = F16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using D0DataType = F16; +using D1DataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using R1DataType = F32; +using RsDataType = ck::Tuple; +using GammaDataType = F16; +using BetaDataType = F16; +using LayerNormOutDataType = F16; +using NormalizeComputeDataType = F32; + +// Layout +using ALayout = Row; +using BLayout = Col; +using D1Layout = Row; +using ELayout = D1Layout; + +// Elementwise op +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = AddReluAdd; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle +//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer| +//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| 
Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector| +//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| | + < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize; + +// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y +using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x(gemm_out), mean, meansquare, gamma, beta + ck::Tuple, // y + NormalizeFunctor, + 2, + 8, // MPerthread + ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta + ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out) + +auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); +}; + +auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + +void host_gemm_layernorm(Tensor& out_m_n, + const Tensor& a_m_k, + const Tensor& b_k_n, + const Tensor& bias_n, + const Tensor& d1_m_n, + const Tensor& gamma_n, + const Tensor& beta_n, + AElementOp a_element_op, + BElementOp b_element_op, + CDEElementOp cde_element_op, + int M, + int N) +{ + + int StrideE = N; + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor mean_m(f_host_tensor_descriptor1d(M, 1)); + Tensor meanSquare_m(f_host_tensor_descriptor1d(M, 1)); + auto averageOpInst = Div{N}; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, e_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + // c = activation(c + bias) + c1_functor(c1) + for(int m = 0; m < M; ++m) + for(int n = 0; n < N; ++n) + { + auto acc = ck::type_convert(e_m_n(m, n)); + cde_element_op(e_m_n(m, n), acc, bias_n(n), d1_m_n(m, n)); + } + + // reduce_mean and reduce_square_mean + auto r0Op = R0ThreadReduceOp{}; + auto r1Op = R1ThreadReduceOp{}; + for(int m = 0; m < M; ++m) + { + auto mean_acc = r0Op.GetIdentityValue(); + auto mean_square_acc = r1Op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + auto e_val = ck::type_convert(e_m_n(m, n)); + 
ReduceAccDataType square_e_val = 0; + Square{}(square_e_val, e_val); + + r0Op(mean_acc, e_val); + r1Op(mean_square_acc, square_e_val); + } + + averageOpInst(mean_acc, mean_acc); + averageOpInst(mean_square_acc, mean_square_acc); + mean_m(m) = ck::type_convert(mean_acc); + meanSquare_m(m) = ck::type_convert(mean_square_acc); + } + + // LayerNorm + auto layerNormInst = NormalizeFunctor{}; + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + LayerNormOutDataType out_val = 0; + layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n)); + out_m_n(m, n) = out_val; + } + } +} + +template +void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M, int N, int K) +{ + std::size_t gemm_flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N; + std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(EDataType) * M * N + sizeof(D0DataType) * M * N + + sizeof(D0DataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M; + + std::size_t normalize_num_byte = sizeof(EDataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M + sizeof(GammaDataType) * N + + sizeof(BetaDataType) * N + sizeof(NormalizeDataType) * M * N; + + float tflops = static_cast(gemm_flop) / 1.E9 / gemm_reduce_time; + float gemm_gb_per_sec = gemm_num_byte / 1.E6 / gemm_reduce_time; + float normalize_gb_per_sec = normalize_num_byte / 1.E6 / normalize_time; + + std::cout << "gemm + reduce_mean + reduce_square_mean Perf: " << gemm_reduce_time << " ms, " + << tflops << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl; + + std::cout << "5-ary elementwise Perf: " << normalize_time << " ms, " << normalize_gb_per_sec + << " GB/s, " << std::endl; +} + +int main() +{ + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideD0 = 0; + ck::index_t StrideD1 = 1024; + ck::index_t StrideE = 1024; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor bias_n(f_host_tensor_descriptor1d(N, 1)); + Tensor d1_m_n(f_host_tensor_descriptor2d(M, N, StrideD1, ELayout{})); + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor r0_Mean_m(f_host_tensor_descriptor1d(M, 1)); + Tensor r1_MeanSquare_m(f_host_tensor_descriptor1d(M, 1)); + Tensor gamma_n(f_host_tensor_descriptor1d(N, 1)); + Tensor beta_n(f_host_tensor_descriptor1d(N, 1)); + Tensor layerNorm_m_n( + f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + + a_m_k.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + bias_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + d1_m_n.GenerateTensorValue(GeneratorTensor_3{-5, 5}); + gamma_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + beta_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(D0DataType) * bias_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize()); + DeviceMem r0_Mean_device_buf(sizeof(R0DataType) * r0_Mean_m.mDesc.GetElementSpaceSize()); + DeviceMem r1_MeanSquare_device_buf(sizeof(R1DataType) * + 
r1_MeanSquare_m.mDesc.GetElementSpaceSize()); + DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpaceSize()); + DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpaceSize()); + DeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * + layerNorm_m_n.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + bias_device_buf.ToDevice(bias_n.mData.data()); + d1_device_buf.ToDevice(d1_m_n.mData.data()); + gamma_device_buf.ToDevice(gamma_n.mData.data()); + beta_device_buf.ToDevice(beta_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + auto qs_element_op = QsElementOp{}; + auto rs_element_op = RsElementOp{N, N}; + + // Prepare GEMM, mean, mean_square + auto gemmReduce = DeviceOpInstance{}; + auto gemmReduce_invoker = gemmReduce.MakeInvoker(); + auto gemmReduce_argument = gemmReduce.MakeArgument( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + {r0_Mean_device_buf.GetDeviceBuffer(), r1_MeanSquare_device_buf.GetDeviceBuffer()}, + M, + N, + K, + StrideA, + StrideB, + {StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + + if(!gemmReduce.IsSupportedArgument(gemmReduce_argument)) + { + throw std::runtime_error("wrong! this device_op instance does not support this problem"); + } + + // init reducetion buffer to 0 + r0_Mean_device_buf.SetZero(); + r1_MeanSquare_device_buf.SetZero(); + + // Prepare LayerNorm + std::array input = {e_device_buf.GetDeviceBuffer(), + r0_Mean_device_buf.GetDeviceBuffer(), + r1_MeanSquare_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer()}; + std::array output = {layerNorm_device_buf.GetDeviceBuffer()}; + + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideE, 1}; + + auto normalize = DeviceNormalizeInstance{}; + auto normalize_invoker = normalize.MakeInvoker(); + auto normalize_argument_ptr = + normalize.MakeArgumentPointer(xyLengths, + {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, + output, + NormalizeFunctor{}); + + if(!normalize.IsSupportedArgument(normalize_argument_ptr.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device, exiting!"); + } + + // run kernel + gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false}); + + bool pass = true; + { + // verification + Tensor host_layerNorm_m_n( + f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + + host_gemm_layernorm(host_layerNorm_m_n, + a_m_k, + b_k_n, + bias_n, + d1_m_n, + gamma_n, + beta_n, + a_element_op, + b_element_op, + cde_element_op, + M, + N); + + layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data()); + pass &= ck::utils::check_err(layerNorm_m_n, + host_layerNorm_m_n, + "Error: Incorrect results layerNorm_m_n", + 1e-2, + 1e-2); + } + + { + // evaluate kernel perf + bool time_kernel = true; + + float gemm_reduce_mean_reduce_square_mean_ave_time = + gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel}); + float normalize_ave_time = + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + 
DumpGemmLayerNormPerf( + gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K); + } + + return pass ? 0 : 1; +} diff --git a/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp new file mode 100644 index 00000000..282c8763 --- /dev/null +++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_fp16.cpp @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/check_err.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +// DataType +using ADataType = F16; +using BDataType = F16; +using GemmAccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F16; +using ReduceAccDataType = F32; +using R0DataType = F32; +using R1DataType = F32; +using RsDataType = ck::Tuple; +using GammaDataType = F16; +using BetaDataType = F16; +using LayerNormOutDataType = F16; +using NormalizeComputeDataType = F32; + +// Layout +using ALayout = Row; +using BLayout = Col; +using D1Layout = Row; +using ELayout = D1Layout; + +// Elementwise op +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; +using QsElementOp = ck::Tuple; +using RsElementOp = ck::Tuple; + +// ReduceOp +using R0ThreadReduceOp = ck::reduce::Add; +using R1ThreadReduceOp = ck::reduce::Add; +using RsThreadReduceOp = ck::Tuple; + +static constexpr auto R0GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +static constexpr auto R1GlobalReduceOp = ck::InMemoryDataOperationEnum::AtomicAdd; +using RsGlobalReduceOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDMultipleR_Xdl_CShuffle +//######| ALayout| BLayout| ELayout| AData| BData| GemmAccData| CShuffle| DsData| EData| ReduceAccData| RsData| A| B| CDE| Qs| Rs| Thread| Global| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CDRThreadTransfer| CDE| RThreadTransfer| +//######| | | | Type| Type| Type| DataType| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| 
Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ReduceThreadTransfer| DstScalarPerVector| +//######| | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _MPerBlock_NPerBlock| ScalarPerVector| _MPerBlock| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | _NPerBlock| | + < ALayout, BLayout, ELayout, ADataType, BDataType, GemmAccDataType, CShuffleDataType, DsDataType, EDataType, ReduceAccDataType, RsDataType, AElementOp, BElementOp, CDEElementOp, QsElementOp, RsElementOp, RsThreadReduceOp, RsGlobalReduceOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<64, 4>, 4, 1>; +// clang-format on + +using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + +using NormalizeFunctor = ck::tensor_operation::element_wise::Normalize; + +// A:x, B:E[x], C:E[x^2], D:Gamma, E:Beta , F:y +using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x(gemm_out), mean, + // meansquare, + // gamma, beta + ck::Tuple, // y + NormalizeFunctor, + 2, + 8, // MPerthread + ck::Sequence<8, 1, 1, 8, 8>, // scalarPerVector: x(gemm_out), mean, meansquare, gamma, beta + ck::Sequence<8>>; // scalarPerVector: y(layerNorm_out) + +auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); +}; + +auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + +void host_gemm_layernorm(Tensor& out_m_n, + const Tensor& a_m_k, + const Tensor& b_k_n, + const Tensor& gamma_n, + const Tensor& beta_n, + AElementOp a_element_op, + BElementOp b_element_op, + CDEElementOp c_element_op, + int M, + int N) +{ + int StrideE = N; + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor mean_m(f_host_tensor_descriptor1d(M, 1)); + Tensor meanSquare_m(f_host_tensor_descriptor1d(M, 1)); + auto averageOpInst = Div{N}; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, e_m_n, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + // reduce_mean and reduce_square_mean + auto r0Op = R0ThreadReduceOp{}; + auto r1Op = R1ThreadReduceOp{}; + for(int m = 0; m < M; ++m) + { + auto mean_acc = r0Op.GetIdentityValue(); + auto mean_square_acc = r1Op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + auto e_val = ck::type_convert(e_m_n(m, n)); + ReduceAccDataType square_e_val = 0; + Square{}(square_e_val, e_val); + + r0Op(mean_acc, e_val); + r1Op(mean_square_acc, square_e_val); + } + + averageOpInst(mean_acc, mean_acc); + averageOpInst(mean_square_acc, 
mean_square_acc); + mean_m(m) = ck::type_convert(mean_acc); + meanSquare_m(m) = ck::type_convert(mean_square_acc); + } + + // LayerNorm + auto layerNormInst = NormalizeFunctor{}; + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + LayerNormOutDataType out_val = 0; + layerNormInst(out_val, e_m_n(m, n), mean_m(m), meanSquare_m(m), gamma_n(n), beta_n(n)); + out_m_n(m, n) = out_val; + } + } +} + +template +void DumpGemmLayerNormPerf(float gemm_reduce_time, float normalize_time, int M, int N, int K) +{ + std::size_t gemm_flop = std::size_t(2) * M * N * K; + std::size_t gemm_num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(EDataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M; + + std::size_t normalize_num_btye = sizeof(EDataType) * M * N + sizeof(R0DataType) * M + + sizeof(R1DataType) * M + sizeof(GammaDataType) * N + + sizeof(BetaDataType) * N + sizeof(NormalizeDataType) * M * N; + + float tflops = static_cast(gemm_flop) / 1.E9 / gemm_reduce_time; + float gemm_gb_per_sec = gemm_num_byte / 1.E6 / gemm_reduce_time; + float normalize_gb_per_sec = normalize_num_btye / 1.E6 / normalize_time; + + std::cout << "gemm + reduce_mean + reduce_square_mean Perf: " << gemm_reduce_time << " ms, " + << tflops << " TFlops, " << gemm_gb_per_sec << " GB/s, " << std::endl; + + std::cout << "5-ary elementwise Perf: " << normalize_time << " ms, " << normalize_gb_per_sec + << " GB/s, " << std::endl; +} + +int main() +{ + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 1024; + + ck::index_t StrideA = 1024; + ck::index_t StrideB = 1024; + ck::index_t StrideE = 1024; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + Tensor e_m_n(f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + Tensor r0_Mean_m(f_host_tensor_descriptor1d(M, 1)); + Tensor r1_MeanSquare_m(f_host_tensor_descriptor1d(M, 1)); + Tensor gamma_n(f_host_tensor_descriptor1d(N, 1)); + Tensor beta_n(f_host_tensor_descriptor1d(N, 1)); + Tensor layerNorm_m_n( + f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + + a_m_k.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + gamma_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + beta_n.GenerateTensorValue(GeneratorTensor_3{-1, 1}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n.mDesc.GetElementSpaceSize()); + DeviceMem r0_Mean_device_buf(sizeof(R0DataType) * r0_Mean_m.mDesc.GetElementSpaceSize()); + DeviceMem r1_MeanSquare_device_buf(sizeof(R1DataType) * + r1_MeanSquare_m.mDesc.GetElementSpaceSize()); + DeviceMem gamma_device_buf(sizeof(GammaDataType) * gamma_n.mDesc.GetElementSpaceSize()); + DeviceMem beta_device_buf(sizeof(BetaDataType) * beta_n.mDesc.GetElementSpaceSize()); + DeviceMem layerNorm_device_buf(sizeof(LayerNormOutDataType) * + layerNorm_m_n.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + gamma_device_buf.ToDevice(gamma_n.mData.data()); + beta_device_buf.ToDevice(beta_n.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + auto qs_element_op = QsElementOp{}; + auto rs_element_op = RsElementOp{N, N}; + + // Prepare GEMM, mean, mean_square + auto 
gemmReduce = DeviceOpInstance{}; + auto gemmReduce_invoker = gemmReduce.MakeInvoker(); + auto gemmReduce_argument = gemmReduce.MakeArgument( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + e_device_buf.GetDeviceBuffer(), + {r0_Mean_device_buf.GetDeviceBuffer(), r1_MeanSquare_device_buf.GetDeviceBuffer()}, + M, + N, + K, + StrideA, + StrideB, + {}, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + + if(!gemmReduce.IsSupportedArgument(gemmReduce_argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + r0_Mean_device_buf.SetZero(); + r1_MeanSquare_device_buf.SetZero(); + + // Prepare LayerNorm + std::array input = {e_device_buf.GetDeviceBuffer(), + r0_Mean_device_buf.GetDeviceBuffer(), + r1_MeanSquare_device_buf.GetDeviceBuffer(), + gamma_device_buf.GetDeviceBuffer(), + beta_device_buf.GetDeviceBuffer()}; + std::array output = {layerNorm_device_buf.GetDeviceBuffer()}; + + std::array xyLengths = {M, N}; + std::array xyStrides = {StrideE, 1}; + + auto normalize = DeviceNormalizeInstance{}; + auto normalize_invoker = normalize.MakeInvoker(); + auto normalize_argument_ptr = + normalize.MakeArgumentPointer(xyLengths, + {xyStrides, {1, 0}, {1, 0}, {0, 1}, {0, 1}}, + {xyStrides}, + input, + output, + NormalizeFunctor{}); + + if(!normalize.IsSupportedArgument(normalize_argument_ptr.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device, exiting"); + } + + // run kernel + gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, false}); + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, false}); + + bool pass = true; + { + // verification + Tensor host_layerNorm_m_n( + f_host_tensor_descriptor2d(M, N, StrideE, ELayout{})); + + host_gemm_layernorm(host_layerNorm_m_n, + a_m_k, + b_k_n, + gamma_n, + beta_n, + a_element_op, + b_element_op, + cde_element_op, + M, + N); + + layerNorm_device_buf.FromDevice(layerNorm_m_n.mData.data()); + pass &= ck::utils::check_err( + layerNorm_m_n, host_layerNorm_m_n, "Error: Incorrect results d1", 1e-3, 1e-3); + } + + { + // evaluate kernel perf + bool time_kernel = true; + + float gemm_reduce_mean_reduce_square_mean_ave_time = + gemmReduce_invoker.Run(gemmReduce_argument, StreamConfig{nullptr, time_kernel}); + float normalize_ave_time = + normalize_invoker.Run(normalize_argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + DumpGemmLayerNormPerf( + gemm_reduce_mean_reduce_square_mean_ave_time, normalize_ave_time, M, N, K); + } + + return pass ? 0 : 1; +} diff --git a/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp new file mode 100644 index 00000000..3c3e36be --- /dev/null +++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_single_kernel_fp16.cpp @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
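+// Added commentary (not part of the original example): the two preceding examples run GEMM and
+// layernorm as two kernels, first reducing E[x] and E[x^2] per row and then handing both moments,
+// gamma and beta to the 5-ary Normalize elementwise op. Assuming Normalize follows the usual
+// moments-based formulation, the per-element math is
+//   var(x) = E[x^2] - (E[x])^2
+//   y      = (x - E[x]) / sqrt(var(x) + epsilon) * gamma + beta
+// This file instead fuses the whole computation into a single kernel, as described below.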
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +// This example demonstrates a single kernel that runs GEMM and layernorm in one fused kernel. +// +// The GEMM + Layernorm implementation is a specialized kernel which allows fusing both layers +// together, provided the GEMM extent N (of MNK) is spanned by a single workgroup. For example, +// a kernel configured with NPerBlock = 128 can handle all GEMM problems with N <= 128. +// +// D = Layernorm(acc_element_op(A * B + broadcast(bias)) + add) * broadcast(gamma) + broadcast(beta) +template <ck::index_t... Is> +using S = ck::Sequence<Is...>; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; +using C0DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +struct Relu +{ + template <typename OutT, typename InT> + __host__ __device__ void operator()(OutT& y, const InT& x) const + { + y = x > 0 ?
x : 0; + } +}; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +// Elementwise operation that operates on the output of matrix multiplication +// i.e., AccElementOp(A * B + bias) +using AccElementOp = Relu; +// Elementwise operation that operates on the output of layer normalization +using CElementOp = Relu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmLayerNorm_Xdl_CShuffle +//######| ALayout| BLayout| CLayout| AData| BData| CData| C0Data| GemmAcc| CShuffle| ReduceAcc| A| B| Acc| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadCopy| +//######| | | | Type| Type| Type| Type| DataType| DataType| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < Row, Col, Row, ADataType, BDataType, CDataType, C0DataType, AccDataType, CShuffleDataType, AccDataType, AElementOp, BElementOp, AccElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 2, S<1, 32, 1, 8>, 8, S<64, 4>, 4>; +// clang-format on + +using ReferenceInstance = ck::tensor_operation::host::ReferenceGemmLayernorm; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 3840; + ck::index_t N = 128; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 128; + + if(argc == 1) + { + // do nothing + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); 
+ } + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor acc_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c0_n_bias({N}); + Tensor c0_m_n_add(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c0_n_gamma({N}); + Tensor c0_n_beta({N}); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + std::cout << "c0_n_bias: " << c0_n_bias.mDesc << std::endl; + std::cout << "c0_m_n_add: " << c0_m_n_add.mDesc << std::endl; + std::cout << "c0_n_gamma: " << c0_n_gamma.mDesc << std::endl; + std::cout << "c0_n_beta: " << c0_n_beta.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + } + + c0_n_bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + c0_m_n_add.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + c0_n_gamma.GenerateTensorValue(GeneratorTensor_2{0, 2}); + c0_n_beta.GenerateTensorValue(GeneratorTensor_2{0, 5}); + c_m_n_host_result.GenerateTensorValue(GeneratorTensor_1{0}); + acc_m_n_host_result.GenerateTensorValue(GeneratorTensor_1{0}); + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c0_bias_buf(sizeof(C0DataType) * c0_n_bias.mDesc.GetElementSpaceSize()); + DeviceMem c0_add_buf(sizeof(C0DataType) * c0_m_n_add.mDesc.GetElementSpaceSize()); + DeviceMem c0_gamma_buf(sizeof(C0DataType) * c0_n_gamma.mDesc.GetElementSpaceSize()); + DeviceMem c0_beta_buf(sizeof(C0DataType) * c0_n_beta.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + c0_bias_buf.ToDevice(c0_n_bias.mData.data()); + c0_add_buf.ToDevice(c0_m_n_add.mData.data()); + c0_gamma_buf.ToDevice(c0_n_gamma.mData.data()); + c0_beta_buf.ToDevice(c0_n_beta.mData.data()); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto acc_element_op = AccElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + static_cast(c0_add_buf.GetDeviceBuffer()), + static_cast(c0_bias_buf.GetDeviceBuffer()), + static_cast(c0_gamma_buf.GetDeviceBuffer()), + 
static_cast(c0_beta_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + acc_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + // extra 6MN flops due to: bias + add + gamma + beta + norm_sub + norm_div, + // excluding reduction steps + std::size_t flop = std::size_t(2) * M * N * K + std::size_t(6) * M * N; + // extra MN and 3N due to c0_add (MxN), bias (1xN), gamma (1xN), beta (1xN) + std::size_t bytes = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * 2 * M * N + sizeof(C0DataType) * 3 * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + bool pass = true; + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + auto ref_gemm = ReferenceInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_m_k, + b_k_n, + c_m_n_host_result, + c0_n_bias, + c0_m_n_add, + c0_n_gamma, + c0_n_beta, + a_element_op, + b_element_op, + acc_element_op, + c_element_op); + + ref_invoker.Run(ref_argument); + + if constexpr(std::is_same::value) + { + pass &= ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results c"); + } + else if constexpr(std::is_same::value) + { + pass &= ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, "Error: Incorrect results c", 1e-2, 1e-2); + } + } + return pass ? 0 : 1; +} diff --git a/example/22_cgemm/CMakeLists.txt b/example/22_cgemm/CMakeLists.txt new file mode 100644 index 00000000..15645611 --- /dev/null +++ b/example/22_cgemm/CMakeLists.txt @@ -0,0 +1,17 @@ +add_custom_target(example_cgemm_xdl) + +add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp) +add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp) +add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp) +add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp) + +add_dependencies(example_cgemm_xdl + example_cgemm_xdl_bf16 + example_cgemm_xdl_fp16 + example_cgemm_xdl_fp32 + example_cgemm_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_cgemm_xdl_int4 cgemm_xdl_int4.cpp) + add_dependencies(example_cgemm_xdl example_cgemm_xdl_int4) +endif() diff --git a/example/22_cgemm/cgemm_xdl_bf16.cpp b/example/22_cgemm/cgemm_xdl_bf16.cpp new file mode 100644 index 00000000..92ed90ce --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_bf16.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
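+// Added commentary: the "4Gemm" in DeviceCGemm_4Gemm_Xdl_CShuffle refers to decomposing one
+// complex GEMM into four real GEMMs on the planar (split real/imaginary) operands:
+//   c_real(m, n) = sum_k( a_real(m, k) * b_real(k, n) ) - sum_k( a_imag(m, k) * b_imag(k, n) )
+//   c_imag(m, n) = sum_k( a_real(m, k) * b_imag(k, n) ) + sum_k( a_imag(m, k) * b_real(k, n) )
+// This is why run_cgemm_xdl in cgemm_xdl_common.hpp counts 8 * M * N * K flops:
+// four real GEMMs at 2 * M * N * K each.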
+ +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = BF16; +using BDataType = BF16; +using CDataType = BF16; +using AccDataType = F32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 8, // index_t ABlockTransferSrcScalarPerVector + 8, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 8, // index_t BBlockTransferSrcScalarPerVector + 8, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 416; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return !run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/22_cgemm/cgemm_xdl_common.hpp b/example/22_cgemm/cgemm_xdl_common.hpp new file mode 100644 index 00000000..6aa06b7c --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_common.hpp @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
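+// Added commentary: run_cgemm_xdl below is shared by all 22_cgemm examples. It allocates separate
+// real/imaginary host tensors for A, B and C, a device workspace sized by
+// DeviceCGemmInstance::GetWorkspaceSize, runs the device CGEMM, and optionally verifies the result
+// against ReferenceCGemm. On the reported numbers: ave_time is in milliseconds, so
+// flop / 1.E9 / ave_time yields TFLOPS and num_btype / 1.E6 / ave_time yields GB/s.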
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/stream_config.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using INT8 = std::int8_t; +using INT32 = std::int32_t; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using INT4 = ck::int4_t; +#endif + +template +bool run_cgemm_xdl(ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + bool do_verification, + int init_method, + bool time_kernel) +{ +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + static_assert(sizeof(ck::int4_t) == sizeof(int8_t), + "sizeof ck::int4_t and int8_t is different!"); + static_assert(sizeof(ADataType) == sizeof(KernelADataType), + "sizeof ADataType and KernelADataType is different!"); + static_assert(sizeof(BDataType) == sizeof(KernelBDataType), + "sizeof BDataType and KernelBDataType is different!"); + static_assert(sizeof(CDataType) == sizeof(KernelCDataType), + "sizeof CDataType and KernelCDataType is different!"); +#endif + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k_real(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor a_m_k_imag(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n_real(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor b_k_n_imag(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_real_device_result( + f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_imag_device_result( + f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k_real: " << a_m_k_real.mDesc << std::endl; + std::cout << "a_m_k_imag: " << a_m_k_imag.mDesc << std::endl; + std::cout << "b_k_n_real: " << b_k_n_real.mDesc << std::endl; + std::cout << "b_k_n_imag: " << b_k_n_imag.mDesc << std::endl; + std::cout << "c_m_n_real: " << c_m_n_real_device_result.mDesc << std::endl; + std::cout << "c_m_n_imag: " << c_m_n_imag_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k_real.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + a_m_k_imag.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n_real.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b_k_n_imag.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + a_m_k_real.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + a_m_k_imag.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + b_k_n_real.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + b_k_n_imag.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + auto cgemm = DeviceCGemmInstance{}; + + DeviceMem a_m_k_real_device_buf(sizeof(KernelADataType) * + a_m_k_real.mDesc.GetElementSpaceSize()); + DeviceMem a_m_k_imag_device_buf(sizeof(KernelADataType) * + a_m_k_imag.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_real_device_buf(sizeof(KernelBDataType) * + b_k_n_real.mDesc.GetElementSpaceSize()); + DeviceMem 
b_k_n_imag_device_buf(sizeof(KernelBDataType) * + b_k_n_imag.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_real_device_buf(sizeof(KernelCDataType) * + c_m_n_real_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_imag_device_buf(sizeof(KernelCDataType) * + c_m_n_imag_device_result.mDesc.GetElementSpaceSize()); + DeviceMem workspace_device_buf(cgemm.GetWorkspaceSize(M, N, K, StrideA, StrideB, StrideC)); + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if constexpr(std::is_same_v) + { + Tensor a_m_k_real_converted(a_m_k_real); + Tensor a_m_k_imag_converted(a_m_k_imag); + Tensor b_k_n_real_converted(b_k_n_real); + Tensor b_k_n_imag_converted(b_k_n_imag); + + a_m_k_real_device_buf.ToDevice(a_m_k_real_converted.mData.data()); + a_m_k_imag_device_buf.ToDevice(a_m_k_imag_converted.mData.data()); + b_k_n_real_device_buf.ToDevice(b_k_n_real_converted.mData.data()); + b_k_n_imag_device_buf.ToDevice(b_k_n_imag_converted.mData.data()); + } + else +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + { + a_m_k_real_device_buf.ToDevice(a_m_k_real.mData.data()); + a_m_k_imag_device_buf.ToDevice(a_m_k_imag.mData.data()); + b_k_n_real_device_buf.ToDevice(b_k_n_real.mData.data()); + b_k_n_imag_device_buf.ToDevice(b_k_n_imag.mData.data()); + } + + auto a_element_op = AElementwiseOperation{}; + auto b_element_op = BElementwiseOperation{}; + auto c_element_op = CElementwiseOperation{}; + + // do GEMM + auto invoker = cgemm.MakeInvoker(); + auto argument = + cgemm.MakeArgument(static_cast(a_m_k_real_device_buf.GetDeviceBuffer()), + static_cast(a_m_k_imag_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_real_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_imag_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_real_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_imag_device_buf.GetDeviceBuffer()), + static_cast(workspace_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + if(!cgemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_cgemm with the specified compilation parameters does " + "not support this CGEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(8) * M * N * K; + std::size_t num_btype = + std::size_t(2) * + (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << cgemm.GetTypeString() << std::endl; + + if(do_verification) + { + Tensor c_m_n_real_host_result( + f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_imag_host_result( + f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + auto ref_cgemm = ReferenceCGemmInstance{}; + auto ref_invoker = ref_cgemm.MakeInvoker(); + auto ref_argument = ref_cgemm.MakeArgument(a_m_k_real, + a_m_k_imag, + b_k_n_real, + b_k_n_imag, + c_m_n_real_host_result, + c_m_n_imag_host_result, + a_element_op, + b_element_op, + c_element_op); + + ref_invoker.Run(ref_argument); + + c_m_n_real_device_buf.FromDevice(c_m_n_real_device_result.mData.data()); + c_m_n_imag_device_buf.FromDevice(c_m_n_imag_device_result.mData.data()); + + bool result = true; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + if constexpr(std::is_same_v) + { + const Tensor c_m_n_real_device_result_converted(c_m_n_real_device_result); + const Tensor c_m_n_imag_device_result_converted(c_m_n_imag_device_result); + + result = ck::utils::check_err(c_m_n_real_device_result_converted, + c_m_n_real_host_result, + "Verification error: incorrect results in real part!", + 1e-2f, + 1e-1f); + result = result && ck::utils::check_err( + c_m_n_imag_device_result_converted, + c_m_n_imag_host_result, + "Verification error: incorrect results in imaginary part!", + 1e-2f, + 1e-1f); + } + else +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + { + result = ck::utils::check_err(c_m_n_real_device_result, + c_m_n_real_host_result, + "Verification error: incorrect results in real part!", + 1e-2f, + 1e-1f); + result = result && ck::utils::check_err( + c_m_n_imag_device_result, + c_m_n_imag_host_result, + "Verification error: incorrect results in imaginary part!", + 1e-2f, + 1e-1f); + } + + return result; + } + return true; +} diff --git a/example/22_cgemm/cgemm_xdl_fp16.cpp b/example/22_cgemm/cgemm_xdl_fp16.cpp new file mode 100644 index 00000000..11373736 --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_fp16.cpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "cgemm_xdl_common.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = F16; +using BDataType = F16; +using CDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 8, // index_t ABlockTransferSrcScalarPerVector + 8, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 8, // index_t BBlockTransferSrcScalarPerVector + 8, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return !run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/22_cgemm/cgemm_xdl_fp32.cpp b/example/22_cgemm/cgemm_xdl_fp32.cpp new file mode 100644 index 00000000..0f45c18c --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_fp32.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = F32; +using BDataType = F32; +using CDataType = F32; +using AccDataType = F32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 4, // index_t ABlockTransferSrcScalarPerVector + 4, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 4, // index_t BBlockTransferSrcScalarPerVector + 4, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 16, 1, 16>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 4>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return !run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/22_cgemm/cgemm_xdl_int4.cpp b/example/22_cgemm/cgemm_xdl_int4.cpp new file mode 100644 index 00000000..c26a83ba --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_int4.cpp @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
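The int4 variant below declares `KernelADataType`/`KernelBDataType`/`KernelCDataType` as `INT8`: the device instance runs on int8 storage, and the int4 host tensors are converted to and from that storage around the device copies (the `CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4` branch in `cgemm_xdl_common.hpp` above converts back the same way for verification). Below is a minimal sketch of that staging, assuming plain `std::vector` buffers; `to_kernel_int8` is a hypothetical helper, not a CK utility.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// int4 values always fit in int8 (-8..7), so staging through int8 buffers is
// lossless in both directions.
std::vector<std::int8_t> to_kernel_int8(const std::vector<int>& int4_values)
{
    std::vector<std::int8_t> out;
    out.reserve(int4_values.size());
    for(int v : int4_values)
    {
        assert(v >= -8 && v <= 7); // int4 range; anything wider would not round-trip
        out.push_back(static_cast<std::int8_t>(v));
    }
    return out;
}
```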
+ +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = INT4; +using BDataType = INT4; +using CDataType = INT4; +using AccDataType = INT32; +using CShuffleDataType = INT32; + +using KernelADataType = INT8; +using KernelBDataType = INT8; +using KernelCDataType = INT8; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 16, // index_t ABlockTransferSrcScalarPerVector + 16, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 8, // index_t BBlockTransferSrcScalarPerVector + 8, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; + + // CGEMM shape + ck::index_t M = 1024; + ck::index_t N = 1152; + ck::index_t K = 512; + + ck::index_t StrideA = K; + ck::index_t StrideB = K; + ck::index_t StrideC = N; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(EXIT_SUCCESS); + } + + return !run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/22_cgemm/cgemm_xdl_int8.cpp b/example/22_cgemm/cgemm_xdl_int8.cpp new file mode 100644 index 00000000..2f241898 --- /dev/null +++ b/example/22_cgemm/cgemm_xdl_int8.cpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro 
Devices, Inc. All rights reserved. + +#include + +#include "cgemm_xdl_common.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp" + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +using ADataType = INT8; +using BDataType = INT8; +using CDataType = INT8; +using AccDataType = INT32; + +using ALayout = ck::tensor_layout::gemm::RowMajor; +using BLayout = ck::tensor_layout::gemm::ColumnMajor; +using CLayout = ck::tensor_layout::gemm::RowMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using ReferenceCGemmInstance = ck::tensor_operation::host:: + ReferenceCGemm; + +// clang-format off +using DeviceCGemmInstance = ck::tensor_operation::device::DeviceCGemm_4Gemm_Xdl_CShuffle + , // typename ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // typename ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename ABlockTransferSrcAccessOrder + 2, // index_t ABlockTransferSrcVectorDim + 16, // index_t ABlockTransferSrcScalarPerVector + 16, // index_t ABlockTransferDstScalarPerVector_AK1 + 1, // index_t ABlockLdsExtraM + S<4, 64, 1>, // typename BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // typename BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // typename BBlockTransferSrcAccessOrder + 2, // index_t BBlockTransferSrcVectorDim + 8, // index_t BBlockTransferSrcScalarPerVector + 8, // index_t BBlockTransferDstScalarPerVector_BK1 + 1, // index_t BBlockLdsExtraN + 1, // index_t CShuffleMXdlPerWavePerShuffle + 1, // index_t CShuffleNXdlPerWavePerShuffle + S<1, 64, 1, 4>, // typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 16>; // index_t CShuffleBlockTransferScalarPerVector_NPerBlock +// clang-format on + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // CGEMM shape + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t StrideA = 4096; + ck::index_t StrideB = 4096; + ck::index_t StrideC = 4096; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 10) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + + StrideA = std::stoi(argv[7]); + StrideB = std::stoi(argv[8]); + StrideC = std::stoi(argv[9]); + } + else + { + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: run kernel # of times (>1)\n" + << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n" + << std::endl; + exit(0); + } + + return !run_cgemm_xdl( + M, N, K, StrideA, StrideB, StrideC, do_verification, init_method, time_kernel); +} diff --git a/example/23_softmax/CMakeLists.txt b/example/23_softmax/CMakeLists.txt new file mode 100644 index 00000000..dafe6552 --- /dev/null +++ b/example/23_softmax/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_softmax_blockwise softmax_blockwise.cpp) \ No newline at end of file diff --git a/example/23_softmax/README.md b/example/23_softmax/README.md new 
file mode 100644 index 00000000..37c43e9b --- /dev/null +++ b/example/23_softmax/README.md @@ -0,0 +1,18 @@ +# Instructions for ```example_softmax_blockwise``` + +## Run ```example_softmax_blockwise``` +```bash +# -D : input 3-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg2: time kernel (0=no, 1=yes) +example_softmax_blockwise -D 4,128,2048 -v 1 1 1 +``` + +Result +``` +launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 0.0242877 ms, 259.039 GB/s, DeviceReduceSoftmax<256,M_C8_S1,K_C32_S8,InSrcVectorDim_1_InSrcVectorSize_8_OutDstVectorSize_8> +``` diff --git a/example/23_softmax/softmax_blockwise.cpp b/example/23_softmax/softmax_blockwise.cpp new file mode 100644 index 00000000..8854bf04 --- /dev/null +++ b/example/23_softmax/softmax_blockwise.cpp @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +using namespace ck::tensor_operation::device; + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 3; +constexpr int NumReduceDim = 1; + +using DeviceInstance = DeviceSoftmaxImpl; // OutScalarPerVector + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {8, 128, 2048}; + std::vector scales = {2.0f, 2.0f}; + + bool do_verification = true; + int init_method = 2; + bool time_kernel = true; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == 
"help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 2 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(scales.empty()) + { + scales.push_back(1.0f); + scales.push_back(0.0f); + }; + + return (0); + }; +}; + +int main(int argc, char* argv[]) +{ + // Example: batched gemm C[G, M, N] applies max/sum reduction along N internally + const std::vector invariantDims{0, 1}; + const std::vector reduceDims{2}; + + SimpleAppArgs args; + + if(argc > 1) + { + if(args.processArgs(argc, argv) < 0) + return (-1); + }; + + Tensor in(args.inLengths); + Tensor out_ref(args.inLengths); + Tensor out(args.inLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + AccDataType alpha = args.scales[0]; + AccDataType beta = args.scales[1]; + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "out: " << out.mDesc << std::endl; + + std::size_t num_thread = 1; + + if(args.do_verification) + { + switch(args.init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + // std::cout << "beta = " << beta << std::endl; + // LogRangeAsType(std::cout << "tensor in: " , in.mData, ",") << std::endl; + // LogRangeAsType(std::cout << "tensor prior out: " , out.mData, ",") << std::endl; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + if(args.do_verification) + { + using ReferenceInstance = + ck::tensor_operation::host::ReferenceSoftmax; + ReferenceInstance ref; + auto ref_arg = ref.MakeArgument(in, out_ref, alpha, beta, reduceDims); + auto invoker = ref.MakeInvoker(); + invoker.Run(ref_arg); + // LogRangeAsType(std::cout << "tensor out_ref: ", out_ref.mData, ",") << std::endl; + }; + + std::vector i_inLengths; + std::vector i_inStrides; + + i_inLengths.assign(args.inLengths.begin(), args.inLengths.end()); + i_inStrides.assign(inStrides.begin(), inStrides.end()); + + auto device_instance = DeviceInstance{}; + + std::cout << i_inLengths.size() << ", " << i_inStrides.size() << std::endl; + + auto argument_ptr = device_instance.MakeArgumentPointer(i_inLengths, + i_inStrides, + reduceDims, + &alpha, + &beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" 
+ << std::endl; + return 1; + }; + + std::string instance_name = device_instance.GetTypeString(); + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + + bool pass = true; + if(args.do_verification) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + out_dev.FromDevice(out.mData.data()); + // LogRangeAsType(std::cout << "tensor out: " , out.mData, ",") << std::endl; + pass = pass && ck::utils::check_err(out, out_ref); + }; + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, args.time_kernel}); + + std::size_t num_bytes = + in.mDesc.GetElementSize() * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << instance_name + << std::endl; + + return (pass ? 0 : 1); +} diff --git a/example/24_batched_gemm/CMakeLists.txt b/example/24_batched_gemm/CMakeLists.txt new file mode 100644 index 00000000..7962576e --- /dev/null +++ b/example/24_batched_gemm/CMakeLists.txt @@ -0,0 +1,17 @@ +add_custom_target(example_batched_gemm_xdl) + +add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp) +add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_xdl_bfp16 batched_gemm_xdl_bfp16.cpp) +add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp) + +add_dependencies(example_batched_gemm_xdl + example_batched_gemm_xdl_fp32 + example_batched_gemm_xdl_fp16 + example_batched_gemm_xdl_bfp16 + example_batched_gemm_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp) + add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4) +endif() diff --git a/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp b/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp new file mode 100644 index 00000000..c684c13d --- /dev/null +++ b/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = F32; +using CShuffleDataType = BF16; +using DsDataType = ck::Tuple<>; +using EDataType = BF16; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using 
DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp new file mode 100644 index 00000000..d1985f9a --- /dev/null +++ b/example/24_batched_gemm/batched_gemm_xdl_fp16.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DsDataType = ck::Tuple<>; +using EDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = 
ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp new file mode 100644 index 00000000..a92a04db --- /dev/null +++ b/example/24_batched_gemm/batched_gemm_xdl_fp32.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl +//######| 
ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 32, 1, 8>, 4>; +// clang-format on + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/24_batched_gemm/batched_gemm_xdl_int4.cpp b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp new file mode 100644 index 00000000..5e82cfe3 --- /dev/null +++ b/example/24_batched_gemm/batched_gemm_xdl_int4.cpp @@ -0,0 +1,99 @@ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = ck::int4_t; +using BDataType = ck::int4_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using DsDataType = ck::Tuple<>; +using EDataType = ck::int4_t; + +using KernelADataType = int8_t; +using KernelBDataType = int8_t; +using KernelEDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl 
+ // clang-format off + < ALayout, //ALayout + BLayout, //BLayout + DsLayout, //DsLayout + ELayout, //ELayout + KernelADataType, //ADataType + KernelBDataType, //BDataType + AccDataType, //AccDataType + CShuffleDataType, //CShuffleDataType + DsDataType, //DsDataType + KernelEDataType, //EDataType + AElementOp, //AElementwiseOperation + BElementOp, //BElementwiseOperation + CDEElementOp, //CDEElementwiseOperation + GemmDefault, //GEMMSpecialization + 1, // NumGemmKPrefetchStage + 256, // BlockSize + 256, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 4, // MXdlPerWave + 2, // NXdlPerWave + S<4, 64, 1>, // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<1, 0, 2>, // ABlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // ABlockTransfer SrcAccessOrder + 2, // ABlockTransfer SrcVectorDim + 16, // ABlockTransfer SrcScalarPerVector + 16, // ABlockTransfer DstScalarPerVector_K1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<1, 0, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<1, 0, 2>, // BBlockTransfer SrcAccessOrder + 2, // BBlockTransfer SrcVectorDim + 16, // BBlockTransfer SrcScalarPerVector + 16, // BBlockTransfer DstScalarPerVector_K1 + 1, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 64, 1, 4>, // CBlockTransferClusterLengths_MBlock_MWaveMPerXdl_NBlock_NWaveNPerXdl + 16>; // CBlockTransferScalarPerVector_NWaveNPerXdl +// clang-format on + +#define BUILD_INT4_EXAMPLE +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/24_batched_gemm/batched_gemm_xdl_int8.cpp b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp new file mode 100644 index 00000000..ad22227a --- /dev/null +++ b/example/24_batched_gemm/batched_gemm_xdl_int8.cpp @@ -0,0 +1,56 @@ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = int8_t; +using BDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int8_t; +using DsDataType = ck::Tuple<>; +using EDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using DsLayout = ck::Tuple<>; +using ELayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// clang-format off +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmMultiD_Xdl +//######| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| 
KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ALayout, BLayout, DsLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>; +// clang-format on + +#include "run_batched_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_batched_gemm_example(argc, argv); } diff --git a/example/24_batched_gemm/run_batched_gemm_example.inc b/example/24_batched_gemm/run_batched_gemm_example.inc new file mode 100644 index 00000000..21934add --- /dev/null +++ b/example/24_batched_gemm/run_batched_gemm_example.inc @@ -0,0 +1,240 @@ +#include + +#pragma once + +struct ProblemSize final +{ + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t stride_A = K; + ck::index_t stride_B = K; + ck::index_t stride_C = N; + + ck::index_t batch_stride_A = M * K; + ck::index_t batch_stride_B = K * N; + ck::index_t batch_stride_C = M * N; + + ck::index_t batch_count = 16; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +bool run_batched_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) +{ + using namespace ck::literals; + +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); + static_assert(sizeof(ADataType) == sizeof(KernelADataType)); + static_assert(sizeof(BDataType) == sizeof(KernelBDataType)); + static_assert(sizeof(EDataType) == sizeof(KernelEDataType)); +#endif + + auto& [M, + N, + K, + stride_A, + stride_B, + stride_C, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_count] = problem_size; + + // GEMM shape + auto f_host_tensor_descriptor = [](std::size_t batch_count_, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count_, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + Tensor a_g_m_k( + f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, ALayout{})); + Tensor b_g_k_n( + 
f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, BLayout{})); +#ifdef BUILD_INT4_EXAMPLE + Tensor e_g_m_n_device_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{})); +#else + Tensor e_g_m_n_device_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{})); +#endif + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; + std::cout << "e_g_m_n: " << e_g_m_n_device_result.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(EDataType) * e_g_m_n_device_result.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor a_g_m_k_converted(a_g_m_k); + const Tensor b_g_k_n_converted(b_g_k_n); + + a_device_buf.ToDevice(a_g_m_k_converted.mData.data()); + b_device_buf.ToDevice(b_g_k_n_converted.mData.data()); +#else + a_device_buf.ToDevice(a_g_m_k.mData.data()); + b_device_buf.ToDevice(b_g_k_n.mData.data()); +#endif + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + + // do GEMM + auto argument = gemm.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + {}, + c_device_buf.GetDeviceBuffer(), + M, + N, + K, + batch_count, + stride_A, + stride_B, + {}, + stride_C, + batch_stride_A, + batch_stride_B, + {}, + batch_stride_C, + a_element_op, + b_element_op, + cde_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + invoker.Run(argument, StreamConfig{nullptr, false}); + bool pass = true; + + if(config.do_verification) + { + c_device_buf.FromDevice(e_g_m_n_device_result.mData.data()); + + using ReferenceBatchedGemmInstance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; + auto ref_invoker = ref_batched_gemm.MakeInvoker(); + + Tensor e_g_m_n_host_result( + f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, ELayout{})); + + auto ref_argument = ref_batched_gemm.MakeArgument( + a_g_m_k, b_g_k_n, e_g_m_n_host_result, a_element_op, b_element_op, cde_element_op); + + ref_invoker.Run(ref_argument); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor e_device_result_converted(e_g_m_n_device_result); + pass &= ck::utils::check_err(e_device_result_converted, e_g_m_n_host_result); + +#else + pass = ck::utils::check_err( + e_g_m_n_device_result, e_g_m_n_host_result, "Error: Incorrect results c"); +#endif + } + + if(config.time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = std::size_t(2) * batch_count * M * N * K; + std::size_t num_btype = sizeof(ADataType) * batch_count * M * K + + sizeof(BDataType) * batch_count * K * N + + sizeof(EDataType) * batch_count * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + + return pass ? 0 : 1; +} + +bool run_batched_gemm_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + std::mt19937 gen(11939); + std::uniform_int_distribution dis(0, 15); + + problem_size.M = 256 * (dis(gen) + 1); + problem_size.N = 128 * (dis(gen) + 1); + problem_size.K = 64 * (dis(gen) + 2); + + problem_size.stride_A = problem_size.K; + problem_size.stride_B = problem_size.K; + problem_size.stride_C = problem_size.N; + + problem_size.batch_stride_A = problem_size.M * problem_size.K; + problem_size.batch_stride_B = problem_size.K * problem_size.N; + problem_size.batch_stride_C = problem_size.M * problem_size.N; + + problem_size.batch_count = 16; + + if(argc == 4) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + exit(0); + } + + return run_batched_gemm(problem_size, config); +} diff --git a/example/25_gemm_bias_e_permute/CMakeLists.txt b/example/25_gemm_bias_e_permute/CMakeLists.txt new file mode 100644 index 00000000..cbc3c007 --- /dev/null +++ b/example/25_gemm_bias_e_permute/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp) +add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp) diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp new file mode 100644 index 00000000..c934d350 --- /dev/null +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp @@ -0,0 +1,397 
@@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimG = 1; +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 3; +static constexpr ck::index_t NumDimK = 1; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //############################################| NumDimG| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| Gemm| A| B| DE| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, 
ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>; +// clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = + false> +struct ReferenceContraction_G1_M2_N3_K1 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_gs_ms_ks_{a_gs_ms_ks}, + b_gs_ns_ks_{b_gs_ns_ks}, + e_gs_ms_ns_{e_gs_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_gs_ms_ks_; + const Tensor& b_gs_ns_ks_; + Tensor& e_gs_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_G1_M2_N3_K1::Argument; + + float Run(const Argument& arg) + { + auto f_gs_ms_ns = [&](auto g0, auto m0, auto m1, auto n0, auto n1, auto n2) { + const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[3]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, ck::type_convert(arg.a_gs_ms_ks_(g0, m0, m1, k0))); + arg.b_element_op_( + v_b, + ck::type_convert(arg.b_gs_ns_ks_(g0, n0, n1, n2, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_gs_ms_ns_(g0, m0, m1, n0, n1, n2) = v_c; + }; + + make_ParallelTensorFunctor(f_gs_ms_ns, + arg.e_gs_ms_ns_.mDesc.GetLengths()[0], + arg.e_gs_ms_ns_.mDesc.GetLengths()[1], + arg.e_gs_ms_ns_.mDesc.GetLengths()[2], + arg.e_gs_ms_ns_.mDesc.GetLengths()[3], + arg.e_gs_ms_ns_.mDesc.GetLengths()[4], + arg.e_gs_ms_ns_.mDesc.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{ + a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_M3_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::index_t G0 = 1; + + ck::index_t M0 = 4; + ck::index_t M1 = 256; + + ck::index_t N0 = 4; + ck::index_t N1 
= 16; + ck::index_t N2 = 32; + + ck::index_t K0 = 256; + + // A[M0, M1, M2, K0] + std::vector a_gs_ms_ks_lengths{G0, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{M0 * M1 * K0, M1 * K0, K0, 1}; + // B[N0, N1, K0] + std::vector b_gs_ns_ks_lengths{G0, N0, N1, N2, K0}; + std::vector b_gs_ns_ks_strides{N0 * N1 * N2 * K0, N1 * N2 * K0, N2 * K0, K0, 1}; + + // D[N0, M0, N1, M1, N2] + std::vector d_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2}; + std::vector d_gs_ms_ns_strides{N0 * N1 * N2, 0, 0, N1 * N2, N2, 1}; + // E[N0, M0, N1, M1, N2] + std::vector e_gs_ms_ns_lengths{G0, M0, M1, N0, N1, N2}; + std::vector e_gs_ms_ns_strides{ + M0 * M1 * N0 * N1 * N2, N1 * M1 * N2, N2, M0 * N1 * M1 * N2, M1 * N2, 1}; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; + std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl; + std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * + e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b_device_buf.ToDevice(b_gs_ns_ks.mData.data()); + d_device_buf.ToDevice(d_gs_ms_ns.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, 
StreamConfig{nullptr, time_kernel}); + + std::size_t M = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{}); + + std::size_t N = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{}); + + std::size_t K = ck::accumulate_n( + a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + using ReferenceOpInstance = ReferenceContraction_G1_M2_N3_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_gs_ms_ks, + b_gs_ns_ks, + c_gs_ms_ns_host_result, + a_element_op, + b_element_op, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0) + { + for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++m0) + { + for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m1) + { + for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++n0) + { + for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n1) + { + for(size_t n2 = 0; n2 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; + ++n2) + { + cde_element_op(e_gs_ms_ns_host_result(g0, m0, m1, n0, n1, n2), + c_gs_ms_ns_host_result(g0, m0, m1, n0, n1, n2), + d_gs_ms_ns(g0, m0, m1, n0, n1, n2)); + } + } + } + } + } + } + + return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp new file mode 100644 index 00000000..98835f98 --- /dev/null +++ b/example/25_gemm_bias_e_permute/gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
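In the g1m2n3k1 example above, the bias tensor D is declared with six lengths but its strides are `{N0*N1*N2, 0, 0, N1*N2, N2, 1}`: the zero strides on the M dimensions broadcast a single (n0, n1, n2) bias value across every (m0, m1), while the E strides encode the permuted output layout. Below is a minimal sketch of how zero strides produce that broadcast; `offset_of` is a hypothetical helper, not the CK tensor descriptor.

```cpp
#include <array>
#include <cstddef>

// Generic strided offset computation. Dimensions with stride 0 contribute
// nothing, so every (m0, m1) index maps to the same bias element and D only
// needs N0*N1*N2 elements even though it is addressed with six indices.
std::size_t offset_of(const std::array<std::size_t, 6>& idx,
                      const std::array<std::size_t, 6>& stride)
{
    std::size_t offset = 0;
    for(std::size_t d = 0; d < idx.size(); ++d)
        offset += idx[d] * stride[d];
    return offset;
}
```

With those strides, index (g0, m0, m1, n0, n1, n2) resolves to n0*N1*N2 + n1*N2 + n2 for every m0 and m1, which is exactly the per-(n0, n1, n2) bias that the `Add` CDE element-wise operation consumes.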
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimG = 1; +static constexpr ck::index_t NumDimM = 3; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 1; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //############################################| NumDimG| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| Gemm| A| B| DE| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>; +// 
clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +template = + false> +struct ReferenceContraction_G1_M3_N2_K1 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_gs_ms_ks_{a_gs_ms_ks}, + b_gs_ns_ks_{b_gs_ns_ks}, + e_gs_ms_ns_{e_gs_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_gs_ms_ks_; + const Tensor& b_gs_ns_ks_; + Tensor& e_gs_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_G1_M3_N2_K1::Argument; + + float Run(const Argument& arg) + { + auto f_gs_ms_ns = [&](auto g0, auto m0, auto m1, auto m2, auto n0, auto n1) { + const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, + ck::type_convert(arg.a_gs_ms_ks_(g0, m0, m1, m2, k0))); + arg.b_element_op_( + v_b, ck::type_convert(arg.b_gs_ns_ks_(g0, n0, n1, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_gs_ms_ns_(g0, m0, m1, m2, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_gs_ms_ns, + arg.e_gs_ms_ns_.mDesc.GetLengths()[0], + arg.e_gs_ms_ns_.mDesc.GetLengths()[1], + arg.e_gs_ms_ns_.mDesc.GetLengths()[2], + arg.e_gs_ms_ns_.mDesc.GetLengths()[3], + arg.e_gs_ms_ns_.mDesc.GetLengths()[4], + arg.e_gs_ms_ns_.mDesc.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{ + a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_G1_M3_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::index_t G0 = 1; + + ck::index_t M0 = 4; + ck::index_t M1 = 8; + ck::index_t M2 = 256; + + ck::index_t N0 = 32; + ck::index_t N1 = 128; + + ck::index_t K0 = 1024; + + // A[M0, M1, M2, K0] + std::vector a_gs_ms_ks_lengths{G0, M0, M1, M2, K0}; + std::vector a_gs_ms_ks_strides{M0 * M1 * M2 * K0, M1 * M2 * K0, M2 * K0, K0, 1}; + + // B[N0, N1, 
K0] + std::vector b_gs_ns_ks_lengths{G0, N0, N1, K0}; + std::vector b_gs_ns_ks_strides{N0 * N1 * K0, N1 * K0, K0, 1}; + + // D[M0, N0, M1, N1, M2] + std::vector d_gs_ms_ns_lengths{G0, M0, M1, M2, N0, N1}; + std::vector d_gs_ms_ns_strides{N0 * N1, 0, 0, 0, N1, 1}; + + // E[M1, M0, N0, M1, N1] + std::vector e_gs_ms_ns_lengths{G0, M0, M1, M2, N0, N1}; + std::vector e_gs_ms_ns_strides{ + M0 * M1 * M2 * N1 * N0, N0 * M1 * N1, N1, M0 * N0 * M1 * N1, M1 * N1, 1}; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; + std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl; + std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * + e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b_device_buf.ToDevice(b_gs_ns_ks.mData.data()); + d_device_buf.ToDevice(d_gs_ms_ns.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + ck::index_t M = + ck::accumulate_n(e_gs_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, 
std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_gs_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + using ReferenceOpInstance = ReferenceContraction_G1_M3_N2_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_gs_ms_ks, + b_gs_ns_ks, + c_gs_ms_ns_host_result, + a_element_op, + b_element_op, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0) + { + for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++m0) + { + for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m1) + { + for(size_t m2 = 0; m2 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++m2) + { + for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n0) + { + for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; + ++n1) + { + cde_element_op(e_gs_ms_ns_host_result(g0, m0, m1, m2, n0, n1), + c_gs_ms_ns_host_result(g0, m0, m1, m2, n0, n1), + d_gs_ms_ns(g0, m0, m1, m2, n0, n1)); + } + } + } + } + } + } + + return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/26_contraction/CMakeLists.txt b/example/26_contraction/CMakeLists.txt new file mode 100644 index 00000000..87f4750e --- /dev/null +++ b/example/26_contraction/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp) +add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp) diff --git a/example/26_contraction/README.md b/example/26_contraction/README.md new file mode 100644 index 00000000..c88d93cf --- /dev/null +++ b/example/26_contraction/README.md @@ -0,0 +1,20 @@ +# Instructions for ```example_contraction_bilinear_xdl_fp32``` + +## Run +```bash +#arg1: verification (0=no, 1=yes) +#arg2: initialization (0=no init, 1=integer value, 2=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_contraction_bilinear_xdl_fp32 1 1 1 +``` + +Result (MI100 @ dynammic freq, 46TFlops peak FP32) +``` +a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1} +b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096} +c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1} +launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... 
+Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4> +``` diff --git a/example/26_contraction/contraction_bilinear_xdl_fp32.cpp b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp new file mode 100644 index 00000000..ea105e4f --- /dev/null +++ b/example/26_contraction/contraction_bilinear_xdl_fp32.cpp @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +template +using S = ck::Sequence; + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DDataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F32; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 
4, 1, 1, 1, S<1, 16, 1, 16>, 4>; + +using DeviceOpInstanceKNNN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>; + +using DeviceOpInstanceMKNN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 
1, 16>, 4>; + +using DeviceOpInstanceMNNN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>; +// clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = false> +struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_ms_ks, + const Tensor& b_ns_ks, + Tensor& e_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_ms_ks_{a_ms_ks}, + b_ns_ks_{b_ns_ks}, + e_ms_ns_{e_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_ms_ks_; + const Tensor& b_ns_ks_; + Tensor& e_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_M2_N2_K2::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { + const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2]; + const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + for(int k1 = 0; k1 < K1; ++k1) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, ck::type_convert(arg.a_ms_ks_(m0, m1, k0, k1))); + arg.b_element_op_( + v_b, ck::type_convert(arg.b_ns_ks_(n0, n1, k0, k1))); + + v_acc += v_a * v_b; + } + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_ms_ns_(m0, m1, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_ms_ns_.mDesc.GetLengths()[0], + 
arg.e_ms_ns_.mDesc.GetLengths()[1], + arg.e_ms_ns_.mDesc.GetLengths()[2], + arg.e_ms_ns_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_ms_ks, + const Tensor& b_ns_ks, + Tensor& e_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_M2_N2_K2" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // D[M0, M1, N0, N1] + std::vector d_ms_ns_lengths{30, 128, 32, 64}; + std::vector d_ms_ns_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; + + float alpha = 1.f; + float beta = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 28) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + const ck::index_t M0 = std::stoi(argv[4]); + const ck::index_t M1 = std::stoi(argv[5]); + + const ck::index_t N0 = std::stoi(argv[6]); + const ck::index_t N1 = std::stoi(argv[7]); + + const ck::index_t K0 = std::stoi(argv[8]); + const ck::index_t K1 = std::stoi(argv[9]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])}; + + d_ms_ns_lengths = {M0, M1, N0, N1}; + d_ms_ns_strides = { + std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])}; + + alpha = std::stof(argv[26]); + beta = std::stof(argv[27]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 7: M0, M1, N0, N1, K0, K1\n"); + printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg14 to 17: Stride_B_N0, 
Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg18 to 21: Stride_D_M0, Stride_D_M1, Stride_D_N0, Stride_D_N1\n"); + printf("arg22 to 25: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg26 to 27: alpha, beta\n"); + exit(0); + } + + Tensor a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides); + Tensor b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides); + Tensor d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides); + Tensor e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + Tensor e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides); + + std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl; + std::cout << "d_ms_ns: " << d_ms_ns.mDesc << std::endl; + std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_ms_ns.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_ms_ks.mData.data()); + b_device_buf.ToDevice(b_ns_ks.mData.data()); + d_device_buf.ToDevice(d_ms_ns.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{alpha, beta}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 1>{d_ms_ns_lengths}, + std::array, 1>{d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + ck::index_t M = + ck::accumulate_n(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(DDataType) * M * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + + using 
ReferenceOpInstance = ReferenceContraction_M2_N2_K2; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0) + { + for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1) + { + for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0) + { + for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1) + { + cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1), + c_ms_ns_host_result(m0, m1, n0, n1), + d_ms_ns(m0, m1, n0, n1)); + } + } + } + } + + return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/26_contraction/contraction_scale_xdl_fp32.cpp b/example/26_contraction/contraction_scale_xdl_fp32.cpp new file mode 100644 index 00000000..26f176b0 --- /dev/null +++ b/example/26_contraction/contraction_scale_xdl_fp32.cpp @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +template +using S = ck::Sequence; + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DsDataType = ck::Tuple<>; +using EDataType = F32; + +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 2; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// clang-format off +using DeviceOpInstanceKKN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + 
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>; + +using DeviceOpInstanceKNN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>; + +using DeviceOpInstanceMKN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + 
//#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>; + +using DeviceOpInstanceMNN = ck::tensor_operation::device:: + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F32, F32, F32, F32, DsDataType, F32, AElementOp, BElementOp, CDEElementOp, GemmSpec, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>; +// clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKN; + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = false> +struct ReferenceContraction_M2_N2_K2 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_ms_ks, + const Tensor& b_ns_ks, + Tensor& e_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_ms_ks_{a_ms_ks}, + b_ns_ks_{b_ns_ks}, + e_ms_ns_{e_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_ms_ks_; + const Tensor& b_ns_ks_; + Tensor& e_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = 
ReferenceContraction_M2_N2_K2::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto m0, auto m1, auto n0, auto n1) { + const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[2]; + const int K1 = arg.a_ms_ks_.mDesc.GetLengths()[3]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + for(int k1 = 0; k1 < K1; ++k1) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, ck::type_convert(arg.a_ms_ks_(m0, m1, k0, k1))); + arg.b_element_op_( + v_b, ck::type_convert(arg.b_ns_ks_(n0, n1, k0, k1))); + + v_acc += v_a * v_b; + } + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_ms_ns_(m0, m1, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_ms_ns_.mDesc.GetLengths()[0], + arg.e_ms_ns_.mDesc.GetLengths()[1], + arg.e_ms_ns_.mDesc.GetLengths()[2], + arg.e_ms_ns_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_ms_ks, + const Tensor& b_ns_ks, + Tensor& e_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_M2_N2_K2" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // A[M0, M1, K0, K1] + std::vector a_ms_ks_lengths{30, 128, 32, 64}; + std::vector a_ms_ks_strides{524288, 4096, 128, 1}; + // B[N0, N1, K0, K1] + std::vector b_ns_ks_lengths{32, 64, 32, 64}; + std::vector b_ns_ks_strides{524288, 4096, 128, 1}; + // E[M0, M1, N0, N1] + std::vector e_ms_ns_lengths{30, 128, 32, 64}; + std::vector e_ms_ns_strides{524288, 4096, 128, 1}; + + float scale = 1.f; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 23) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + const ck::index_t M0 = std::stoi(argv[4]); + const ck::index_t M1 = std::stoi(argv[5]); + + const ck::index_t N0 = std::stoi(argv[6]); + const ck::index_t N1 = std::stoi(argv[7]); + + const ck::index_t K0 = std::stoi(argv[8]); + const ck::index_t K1 = std::stoi(argv[9]); + + a_ms_ks_lengths = {M0, M1, K0, K1}; + a_ms_ks_strides = { + std::stoi(argv[10]), std::stoi(argv[11]), std::stoi(argv[12]), std::stoi(argv[13])}; + + b_ns_ks_lengths = {N0, N1, K0, K1}; + b_ns_ks_strides = { + std::stoi(argv[14]), std::stoi(argv[15]), std::stoi(argv[16]), std::stoi(argv[17])}; + + e_ms_ns_lengths = {M0, M1, N0, N1}; + e_ms_ns_strides = { + std::stoi(argv[18]), std::stoi(argv[19]), 
std::stoi(argv[20]), std::stoi(argv[21])}; + + scale = std::stof(argv[22]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n"); + printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n"); + printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n"); + printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n"); + printf("arg22: scale\n"); + exit(0); + } + + Tensor a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides); + Tensor b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides); + Tensor e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + Tensor e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides); + + std::cout << "a_ms_ks: " << a_ms_ks.mDesc << std::endl; + std::cout << "b_ns_ks: " << b_ns_ks.mDesc << std::endl; + std::cout << "e_ms_ns: " << e_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_ms_ks.mData.data()); + b_device_buf.ToDevice(b_ns_ks.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{scale}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{}, + e_device_buf.GetDeviceBuffer(), + a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + std::array, 0>{}, + std::array, 0>{}, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + ck::index_t M = + ck::accumulate_n(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + +sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + + using ReferenceOpInstance = ReferenceContraction_M2_N2_K2; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = 
ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_ms_ks, b_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0) + { + for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1) + { + for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++n0) + { + for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n1) + { + cde_element_op(e_ms_ns_host_result(m0, m1, n0, n1), + c_ms_ns_host_result(m0, m1, n0, n1)); + } + } + } + } + + return ck::utils::check_err(e_ms_ns_device_result, e_ms_ns_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt new file mode 100644 index 00000000..d96deae4 --- /dev/null +++ b/example/27_layernorm/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_layernorm_blockwise layernorm_blockwise.cpp) diff --git a/example/27_layernorm/layernorm_blockwise.cpp b/example/27_layernorm/layernorm_blockwise.cpp new file mode 100644 index 00000000..147307d9 --- /dev/null +++ b/example/27_layernorm/layernorm_blockwise.cpp @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +int main() +{ + bool time_kernel = false; + + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t Stride = N; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + using namespace ck::literals; + + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + }; + + Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); + Tensor gamma(f_host_tensor_descriptor1d(N, 1)); + Tensor beta(f_host_tensor_descriptor1d(N, 1)); + Tensor y(f_host_tensor_descriptor2d(M, N, Stride)); + + x.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + gamma.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + beta.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + 
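+    // Copy the host inputs into device memory. Conceptually, for each row m of the
+    // M x N input the kernel below reduces over the N axis and computes
+    //   y[m, n] = gamma[n] * (x[m, n] - mean[m]) / sqrt(var[m] + epsilon) + beta[n]
+    // with epsilon = 1e-4, matching the ReferenceLayernorm host check further down.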
x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {M, N}, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + {0, 1}, + {0, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1}, + 1e-4, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + bool pass = true; + { + Tensor host_y(f_host_tensor_descriptor2d(M, N, Stride)); + using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results d1", 1e-3, 1e-3); + } + return (pass ? 0 : 1); +} diff --git a/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt b/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt new file mode 100644 index 00000000..44ab1689 --- /dev/null +++ b/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_grouped_gemm_bias_e_permute_xdl_fp16 grouped_gemm_bias_e_permute_xdl_fp16.cpp) diff --git a/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp new file mode 100644 index 00000000..f8e6501e --- /dev/null +++ b/example/28_grouped_gemm_bias_e_permute/grouped_gemm_bias_e_permute_xdl_fp16.cpp @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
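+// Sketch of what this example exercises (inferred from main() below): it creates a
+// random number of groups (1 to 16); each group is a contraction with 3 M dimensions,
+// 2 N dimensions and 1 K dimension,
+//
+//   C[m0, m1, m2, n0, n1] = sum_k0 A[m0, m1, m2, k0] * B[n0, n1, k0]
+//   E[m0, m1, m2, n0, n1] = C[m0, m1, m2, n0, n1] + D[m0, m1, m2, n0, n1]
+//
+// where D is a per-group bias broadcast over the M dimensions, and all groups are
+// dispatched together through the DeviceGroupedContractionMultipleD_Xdl_CShuffle
+// instance defined below.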
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimM = 3; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 1; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Packed; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //############################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| Gemm| A| B| DE| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //############################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //############################################| | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedContractionMultipleD_Xdl_CShuffle< NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>; +// clang-format on + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = false> +struct ReferenceContraction_M3_N2_K1 : 
public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_ms_ks, + const Tensor& b_ns_ks, + Tensor& e_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_ms_ks_{a_ms_ks}, + b_ns_ks_{b_ns_ks}, + e_ms_ns_{e_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_ms_ks_; + const Tensor& b_ns_ks_; + Tensor& e_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_M3_N2_K1::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto m0, auto m1, auto m2, auto n0, auto n1) { + const int K0 = arg.a_ms_ks_.mDesc.GetLengths()[3]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, ck::type_convert(arg.a_ms_ks_(m0, m1, m2, k0))); + arg.b_element_op_( + v_b, ck::type_convert(arg.b_ns_ks_(n0, n1, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_ms_ns_(m0, m1, m2, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_ms_ns_.mDesc.GetLengths()[0], + arg.e_ms_ns_.mDesc.GetLengths()[1], + arg.e_ms_ns_.mDesc.GetLengths()[2], + arg.e_ms_ns_.mDesc.GetLengths()[3], + arg.e_ms_ns_.mDesc.GetLengths()[4])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_ms_ks, + const Tensor& b_ns_ks, + Tensor& e_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{a_ms_ks, b_ns_ks, e_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_M3_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=n0, 1=yes)\n"); + exit(0); + } + + std::size_t group_count = rand() % 16 + 1; + + // GEMM shape + std::vector> contraction_descs; + std::vector p_a, p_b; + std::vector> p_ds; + std::vector p_c; + + contraction_descs.reserve(group_count); + + for(std::size_t i = 0; i < group_count; i++) + { + int M0 = 4 * (rand() % 4 + 1); + int 
M1 = 4 * (rand() % 4 + 1); + int M2 = 256; + + int N0 = 4 * (rand() % 4 + 1); + int N1 = 128; + + int K0 = 64 * (rand() % 4 + 1); + + // A[M0, M1, M2, K0] + std::vector a_ms_ks_lengths{M0, M1, M2, K0}; + std::vector a_ms_ks_strides{M1 * M2 * K0, M2 * K0, K0, 1}; + // B[N0, N1, K0] + std::vector b_ns_ks_lengths{N0, N1, K0}; + std::vector b_ns_ks_strides{N1 * K0, K0, 1}; +#if 0 + // D[M0, N0, M1, N1, M2] + std::vector d_ms_ns_lengths{M0, M1, M2, N0, N1}; + std::vector d_ms_ns_strides{0, 0, 0, N1, 1}; + // E[M0, N0, M1, N1, M2] + std::vector e_ms_ns_lengths{M0, M1, M2, N0, N1}; + std::vector e_ms_ns_strides{N0 * M1 * N1 * M2, N1 * M2, 1, M1 * N1 * M2, M2}; +#else + // D[M0, N0, M1, N1, M2] + std::vector d_ms_ns_lengths{M0, M1, M2, N0, N1}; + std::vector d_ms_ns_strides{0, 0, 0, N1, 1}; + // E[M0, N0, M1, N1, M2] + std::vector e_ms_ns_lengths{M0, M1, M2, N0, N1}; + std::vector e_ms_ns_strides{M1 * M2 * N0 * N1, M2 * N0 * N1, N0 * N1, N1, 1}; +#endif + + contraction_descs.push_back( + ck::tensor_operation::device::ContractionDesc<1>{a_ms_ks_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + {d_ms_ns_lengths}, + {d_ms_ns_strides}, + e_ms_ns_lengths, + e_ms_ns_strides}); + } + + std::vector> a_tensors; + std::vector> b_tensors; + std::vector> d_tensors; + std::vector> e_device_tensors; + + a_tensors.reserve(group_count); + b_tensors.reserve(group_count); + d_tensors.reserve(group_count); + e_device_tensors.reserve(group_count); + + using DeviceMemPtr = std::unique_ptr; + + std::vector a_tensors_device, b_tensors_device, d_tensors_device, + e_tensors_device; + + a_tensors_device.reserve(group_count); + b_tensors_device.reserve(group_count); + d_tensors_device.reserve(group_count); + e_tensors_device.reserve(group_count); + + std::size_t flop = 0, num_btype = 0; + + for(std::size_t i = 0; i < contraction_descs.size(); i++) + { + const auto a_ms_ks_lengths = contraction_descs[i].a_ms_ks_lengths; + const auto a_ms_ks_strides = contraction_descs[i].a_ms_ks_strides; + + const auto b_ns_ks_lengths = contraction_descs[i].b_ns_ks_lengths; + const auto b_ns_ks_strides = contraction_descs[i].b_ns_ks_strides; + + const auto d_ms_ns_lengths = contraction_descs[i].ds_ms_ns_lengths[0]; + const auto d_ms_ns_strides = contraction_descs[i].ds_ms_ns_strides[0]; + + const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths; + const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides; + + Tensor a_ms_ks(a_ms_ks_lengths, a_ms_ks_strides); + Tensor b_ns_ks(b_ns_ks_lengths, b_ns_ks_strides); + Tensor d_ms_ns(d_ms_ns_lengths, d_ms_ns_strides); + Tensor e_ms_ns_device_result(e_ms_ns_lengths, e_ms_ns_strides); + + ck::index_t M_ = + ck::accumulate_n(e_ms_ns_lengths.begin(), NumDimM, 1, std::multiplies<>{}); + + ck::index_t N_ = ck::accumulate_n( + e_ms_ns_lengths.begin() + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K_ = ck::accumulate_n( + a_ms_ks_lengths.begin() + NumDimM, NumDimK, 1, std::multiplies<>{}); + + a_tensors.push_back(a_ms_ks); + b_tensors.push_back(b_ns_ks); + d_tensors.push_back(d_ms_ns); + + // e_host_tensors.push_back(e_ms_ns_host_result); + e_device_tensors.push_back(e_ms_ns_device_result); + + flop += std::size_t(2) * M_ * K_ * N_; + + num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() + + sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() + + sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSize(); + + std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc + << " b_n_k: " << b_tensors[i].mDesc << " c_m_n: " << 
e_device_tensors[i].mDesc + << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_tensors[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_tensors[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_tensors[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_tensors[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_tensors[i].GenerateTensorValue(GeneratorTensor_1{}); + b_tensors[i].GenerateTensorValue(GeneratorTensor_1{}); + d_tensors[i].GenerateTensorValue(GeneratorTensor_1{}); + } + } + + for(std::size_t i = 0; i < contraction_descs.size(); i++) + { + a_tensors_device.emplace_back(std::make_unique( + sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize())); + b_tensors_device.emplace_back(std::make_unique( + sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize())); + d_tensors_device.emplace_back(std::make_unique( + sizeof(DDataType) * d_tensors[i].mDesc.GetElementSpaceSize())); + e_tensors_device.emplace_back(std::make_unique( + sizeof(EDataType) * e_device_tensors[i].mDesc.GetElementSpaceSize())); + + a_tensors_device[i]->ToDevice(a_tensors[i].mData.data()); + b_tensors_device[i]->ToDevice(b_tensors[i].mData.data()); + d_tensors_device[i]->ToDevice(d_tensors[i].mData.data()); + + p_a.push_back(a_tensors_device[i]->GetDeviceBuffer()); + p_b.push_back(b_tensors_device[i]->GetDeviceBuffer()); + p_ds.push_back({d_tensors_device[i]->GetDeviceBuffer()}); + p_c.push_back(e_tensors_device[i]->GetDeviceBuffer()); + } + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + auto gemm = DeviceOpInstanceKKNN{}; + auto invoker = gemm.MakeInvoker(); + + // do GEMM + auto argument = gemm.MakeArgument( + p_a, p_b, p_ds, p_c, contraction_descs, a_element_op, b_element_op, cde_element_op); + + DeviceMem contraction_desc_workspace(gemm.GetWorkSpaceSize(&argument)); + + gemm.SetWorkSpacePointer(&argument, contraction_desc_workspace.GetDeviceBuffer()); + + if(!gemm.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_gemm with the specified compilation parameters does " + "not support this GEMM problem"); + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verification) + { + for(std::size_t i = 0; i < group_count; i++) + { + const auto e_ms_ns_lengths = contraction_descs[i].e_ms_ns_lengths; + const auto e_ms_ns_strides = contraction_descs[i].e_ms_ns_strides; + + Tensor c_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + + Tensor e_ms_ns_host_result(e_ms_ns_lengths, e_ms_ns_strides); + + e_tensors_device[i]->FromDevice(e_device_tensors[i].mData.data()); + + using ReferenceOpInstance = ReferenceContraction_M3_N2_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_tensors[i], + b_tensors[i], + c_ms_ns_host_result, + a_element_op, + b_element_op, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t m0 = 0; m0 < e_ms_ns_host_result.mDesc.GetLengths()[0]; ++m0) + { + for(size_t m1 = 0; m1 < e_ms_ns_host_result.mDesc.GetLengths()[1]; ++m1) + { + for(size_t m2 = 0; m2 < e_ms_ns_host_result.mDesc.GetLengths()[2]; ++m2) + { + for(size_t n0 = 0; n0 < e_ms_ns_host_result.mDesc.GetLengths()[3]; ++n0) + { + for(size_t n1 = 0; n1 < e_ms_ns_host_result.mDesc.GetLengths()[4]; ++n1) + { + cde_element_op(e_ms_ns_host_result(m0, m1, m2, n0, n1), + c_ms_ns_host_result(m0, m1, m2, n0, n1), + d_tensors[i](m0, m1, m2, n0, n1)); + } + } + } + } + } + + pass &= ck::utils::check_err(e_device_tensors[i], e_ms_ns_host_result); + } + } + + return pass ? 0 : 1; +} diff --git a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt new file mode 100644 index 00000000..40470f27 --- /dev/null +++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp) diff --git a/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp new file mode 100644 index 00000000..25d815b9 --- /dev/null +++ b/example/29_batched_gemm_bias_e_permute/batched_gemm_bias_e_permute_xdl_fp16.cpp @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
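// The verification loop above applies the CDE elementwise op on the host: for every
// (m0, m1, m2, n0, n1) it combines the reference contraction result with the broadcast D
// tensor before calling check_err. A minimal standalone sketch of that rank-(3,2,1)
// contraction with an "Add" epilogue, using plain std::vector and float throughout (this is
// an illustration, not the CK reference operator; all names below are made up):
#include <cstddef>
#include <vector>

// E[m0,m1,m2,n0,n1] = sum_k A[m0,m1,m2,k] * B[n0,n1,k] + D[n0,n1]   (D broadcast over m-dims)
inline void contraction_m3_n2_k1_add(const std::vector<float>& A,
                                     const std::vector<float>& B,
                                     const std::vector<float>& D,
                                     std::vector<float>& E,
                                     std::size_t M0, std::size_t M1, std::size_t M2,
                                     std::size_t N0, std::size_t N1, std::size_t K0)
{
    for(std::size_t m0 = 0; m0 < M0; ++m0)
    for(std::size_t m1 = 0; m1 < M1; ++m1)
    for(std::size_t m2 = 0; m2 < M2; ++m2)
    for(std::size_t n0 = 0; n0 < N0; ++n0)
    for(std::size_t n1 = 0; n1 < N1; ++n1)
    {
        float acc = 0.f;
        for(std::size_t k = 0; k < K0; ++k)
        {
            acc += A[((m0 * M1 + m1) * M2 + m2) * K0 + k] * B[(n0 * N1 + n1) * K0 + k];
        }
        // "Add" CDE epilogue; D uses stride-0 broadcast over the m-dimensions
        E[(((m0 * M1 + m1) * M2 + m2) * N0 + n0) * N1 + n1] = acc + D[n0 * N1 + n1];
    }
}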
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/numeric.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 1; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //############################################| NumDimG| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| Gemm| A| B| DE| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>; +// 
clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = + false> +struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_gs_ms_ks_{a_gs_ms_ks}, + b_gs_ns_ks_{b_gs_ns_ks}, + e_gs_ms_ns_{e_gs_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_gs_ms_ks_; + const Tensor& b_gs_ns_ks_; + Tensor& e_gs_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_G2_M2_N2_K1::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) { + const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, + ck::type_convert(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0))); + arg.b_element_op_( + v_b, + ck::type_convert(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_gs_ms_ns_.mDesc.GetLengths()[0], + arg.e_gs_ms_ns_.mDesc.GetLengths()[1], + arg.e_gs_ms_ns_.mDesc.GetLengths()[2], + arg.e_gs_ms_ns_.mDesc.GetLengths()[3], + arg.e_gs_ms_ns_.mDesc.GetLengths()[4], + arg.e_gs_ms_ns_.mDesc.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{ + a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_G2_M2_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::index_t G0 = 1; + ck::index_t G1 = 2; + + ck::index_t M0 = 4; + ck::index_t M1 = 256; + + ck::index_t N0 = 16; + ck::index_t N1 = 128; + + ck::index_t K0 = 64; + + // A[G0, G1, M0, M1, K0] + std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{G1 * M0 * M1 
* K0, M0 * M1 * K0, M1 * K0, K0, 1}; + // B[G0, G1, N0, N1, K0] + std::vector b_gs_ns_ks_lengths{G0, G1, N0, N1, K0}; + std::vector b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1}; + + // D[G0, G1, M0, N0, M1, N1] + std::vector d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1}; + // E[G0, G1, M0, N0, M1, N1] + std::vector e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector e_gs_ms_ns_strides{ + G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1}; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b_gs_ns_ks(b_gs_ns_ks_lengths, b_gs_ns_ks_strides); + Tensor d_gs_ms_ns(d_gs_ms_ns_lengths, d_gs_ms_ns_strides); + Tensor e_gs_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + Tensor e_gs_ms_ns_device_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; + std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl; + std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * + e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b_device_buf.ToDevice(b_gs_ns_ks.mData.data()); + d_device_buf.ToDevice(d_gs_ms_ns.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + ck::index_t G = + ck::accumulate_n(e_gs_ms_ns_lengths.begin(), NumDimG, 1, std::multiplies<>{}); + + 
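// The flop/bandwidth math here flattens the multi-index problem into plain GEMM sizes:
// G, M, N and K are products over sub-ranges of the length vectors (that is what
// ck::accumulate_n computes around this point). The helper below is a plain-STL stand-in for
// that pattern, not the CK utility:
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

inline std::size_t product_n(const std::vector<std::size_t>& lengths,
                             std::size_t first,
                             std::size_t count)
{
    return std::accumulate(lengths.begin() + first,
                           lengths.begin() + first + count,
                           std::size_t{1},
                           std::multiplies<>{});
}

// With the default shapes above (G0=1, G1=2, M0=4, M1=256, N0=16, N1=128, K0=64),
// e_gs_ms_ns_lengths = {1, 2, 4, 256, 16, 128}:
//   G = product_n(lengths, 0, NumDimG)                  = 2
//   M = product_n(lengths, NumDimG, NumDimM)            = 1024
//   N = product_n(lengths, NumDimG + NumDimM, NumDimN)  = 2048
// K is taken from a_gs_ms_ks_lengths the same way, giving K = 64,
// so flop = 2 * G * M * N * K = 2 * 2 * 1024 * 2048 * 64, roughly 0.54 GFLOP.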
ck::index_t M = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG, NumDimM, 1, std::multiplies<>{}); + + ck::index_t N = ck::accumulate_n( + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, NumDimN, 1, std::multiplies<>{}); + + ck::index_t K = ck::accumulate_n( + a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, NumDimK, 1, std::multiplies<>{}); + + std::size_t flop = std::size_t(2) * G * M * N * K; + std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N + + sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_ms_ns_host_result(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(size_t g0 = 0; g0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[0]; ++g0) + { + for(size_t g1 = 0; g1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[1]; ++g1) + { + for(size_t m0 = 0; m0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[2]; ++m0) + { + for(size_t m1 = 0; m1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[3]; ++m1) + { + for(size_t n0 = 0; n0 < e_gs_ms_ns_host_result.mDesc.GetLengths()[4]; ++n0) + { + for(size_t n1 = 0; n1 < e_gs_ms_ns_host_result.mDesc.GetLengths()[5]; + ++n1) + { + cde_element_op(e_gs_ms_ns_host_result(g0, g1, m0, m1, n0, n1), + c_ms_ns_host_result(g0, g1, m0, m1, n0, n1), + d_gs_ms_ns(g0, g1, m0, m1, n0, n1)); + } + } + } + } + } + } + + return ck::utils::check_err(e_gs_ms_ns_device_result, e_gs_ms_ns_host_result) ? 
0 : 1; + } + + return 0; +} diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt new file mode 100644 index 00000000..61b2b2f6 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt @@ -0,0 +1,22 @@ +add_custom_target(example_grouped_conv_fwd_multiple_d) + +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp) +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp) +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp) +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp) + +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16) +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32) +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16) +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp) + + add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4) +endif() # USE_BITINT_EXTENSION_INT4 + + +add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp) + +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16) diff --git a/example/30_grouped_conv_fwd_multiple_d/README.md b/example/30_grouped_conv_fwd_multiple_d/README.md new file mode 100644 index 00000000..739a0425 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/README.md @@ -0,0 +1,30 @@ +Command +```bash +arg1: verification (0=no, 1=yes) +arg2: initialization (0=no init, 1=integer value, 2=decimal value) +arg3: time kernel (0=no, 1=yes) +Following arguments (depending on number of spatial dims): + Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d) + G, N, K, C, + , (ie Y, X for 2D) + , (ie Hi, Wi for 2D) + , (ie Sy, Sx for 2D) + , (ie Dy, Dx for 2D) + , (ie LeftPy, LeftPx for 2D) + , (ie RightPy, RightPx for 2D) + +./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1 +``` + +Result (MI100) +``` +in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192} +wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192} +bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} +residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} +out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256} +launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default> +``` diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp new file mode 100644 index 00000000..d6d6dd6f --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
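// A quick sanity check of the perf figures printed in the README above, assuming the usual
// forward-convolution FLOP count flop = 2 * G * N * K * C * (Y * X) * (Ho * Wo). With the
// shapes shown there (G=1, N=128, K=256, C=192, 3x3 filter, 36x36 output) and the reported
// 1.55981 ms this reproduces the ~94 TFlops line. The snippet below is only a
// back-of-the-envelope helper, not part of the example itself:
#include <cstdio>

int main()
{
    const double G = 1, N = 128, K = 256, C = 192;
    const double Y = 3, X = 3, Ho = 36, Wo = 36;
    const double flop        = 2.0 * G * N * K * C * Y * X * Ho * Wo; // ~1.47e11
    const double avg_time_ms = 1.55981;                               // from the README output
    // Same convention as the examples: flop / 1e9 / time_in_ms == TFLOP/s
    std::printf("TFlops = %.4f\n", flop / 1.e9 / avg_time_ms);        // ~94.09
    return 0;
}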
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using BF16 = ck::bhalf_t; +using FP16 = ck::half_t; +using FP32 = float; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using I4 = ck::int4_t; +#endif +using I8 = std::int8_t; +using I32 = std::int32_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +struct CommonLayoutSetting +{ + using InputLayout = InputLay; + using WeightLayout = WeightLay; + using OutputLayout = OutputLay; +}; + +template +struct CommonLayoutSettingSelector; + +namespace ctl = ck::tensor_layout::convolution; + +template <> +struct CommonLayoutSettingSelector<1> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<2> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<3> final + : CommonLayoutSetting +{ +}; + +template +using InputLayout = typename CommonLayoutSettingSelector::InputLayout; + +template +using WeightLayout = typename CommonLayoutSettingSelector::WeightLayout; + +template +using OutputLayout = typename CommonLayoutSettingSelector::OutputLayout; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; +}; + +#define DefaultConvParam \ + ck::utils::conv::ConvParam \ + { \ + 2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \ + } + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool parse_cmd_args(int argc, + char* argv[], + ExecutionConfig& config, + ck::utils::conv::ConvParam& conv_param) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam 
arguments + else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + conv_param = ck::utils::conv::parse_conv_param( + num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + print_help_msg(); + return false; + } + + return true; +} + +inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.G_ * conv_param.C_ // wi + }); + + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1], + conv_param.input_spatial_lengths_[2]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * + conv_param.G_ * conv_param.C_, // di + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k + 1, // c + conv_param.C_ // x + }); + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y + conv_param.C_ // x + }); + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1], + conv_param.filter_spatial_lengths_[2]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.filter_spatial_lengths_[2] * 
conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // z + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y + conv_param.C_ // x + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + 0, // k + 1, // c + 0 // x + }); + case 2: + return HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + case 3: + return HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // z + 0, // y + 0 // x + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.G_ * conv_param.K_ // wo + }); + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * + conv_param.G_ * conv_param.K_, // do + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp new file mode 100644 index 00000000..ee300d07 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
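// The host descriptors in common.hpp keep the logical index order (G, N, K, Ho, Wo) while the
// strides place the data NHWGK-contiguous in memory (channels last, groups interleaved with
// channels), and the bias descriptor uses zero strides to broadcast one value per (g, k) over
// N/Ho/Wo. A tiny offset helper (illustrative only, not a CK type) makes the mapping explicit:
#include <cstddef>

struct OutDesc2D // logical [G, N, K, Ho, Wo], physically NHWGK
{
    std::size_t G, N, K, Ho, Wo;

    std::size_t offset(std::size_t g, std::size_t n, std::size_t k,
                       std::size_t ho, std::size_t wo) const
    {
        // strides {K, Ho*Wo*G*K, 1, Wo*G*K, G*K} -- same values as make_output_descriptor
        return g * K + n * (Ho * Wo * G * K) + k + ho * (Wo * G * K) + wo * (G * K);
    }

    std::size_t bias_offset(std::size_t g, std::size_t /*n*/, std::size_t k,
                            std::size_t /*ho*/, std::size_t /*wo*/) const
    {
        // strides {K, 0, 1, 0, 0} -- the same (g, k) element is reused for every n/ho/wo
        return g * K + k;
    }
};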
+ +#include "common.hpp" + +// kernel data types +using InKernelDataType = BF16; +using WeiKernelDataType = BF16; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using BiasKernelDataType = BF16; +using ResidualKernelDataType = BF16; +using OutKernelDataType = BF16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp new file mode 100644 index 00000000..5a9df0b1 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = FP16; +using WeiKernelDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using BiasKernelDataType = FP16; +using ResidualKernelDataType = FP16; +using OutKernelDataType = FP16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp new file mode 100644 index 00000000..c2906cc9 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +// kernel data types +using InKernelDataType = FP32; +using WeiKernelDataType = FP32; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using BiasKernelDataType = FP32; +using ResidualKernelDataType = FP32; +using OutKernelDataType = FP32; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp new file mode 100644 index 00000000..3d5a243e --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include "common.hpp" + +// kernel data types +using InKernelDataType = I8; +using WeiKernelDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I8; +using BiasKernelDataType = I8; +using ResidualKernelDataType = I8; +using OutKernelDataType = I8; + +// tensor data types +using InUserDataType = I4; +using WeiUserDataType = I4; +using OutUserDataType = I4; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#define BUILD_INT4_EXAMPLE +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp new file mode 100644 index 00000000..eaf680fa --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "common.hpp" + +// kernel data types +using InKernelDataType = I8; +using WeiKernelDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I8; +using BiasKernelDataType = I8; +using ResidualKernelDataType = I8; +using OutKernelDataType = I8; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp new file mode 100644 index 00000000..6de1daa3 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = FP16; +using WeiKernelDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using OutKernelDataType = FP16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = PassThrough; + +#include "run_grouped_conv_fwd_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc new file mode 100644 index 00000000..4561156e --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +template +struct LayoutSetting +{ + using BiasLayout = BiasLay; + using ResidualLayout = ResidualLay; +}; + +template +struct LayoutSettingSelector; + +template <> +struct LayoutSettingSelector<1> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<2> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<3> final : LayoutSetting +{ +}; + +template +using BiasLayout = typename LayoutSettingSelector::BiasLayout; + +template +using ResidualLayout = typename LayoutSettingSelector::ResidualLayout; + +template +using DeviceConvFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InputLayout, + WeightLayout, + ck::Tuple, ResidualLayout>, + OutputLayout, + InKernelDataType, + WeiKernelDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 16, // KPerBlock + 4, // AK1 + 4, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 4, // ABlockTransferSrcScalarPerVector + 4, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 4, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 16, 1, 16>, + 4>; + +template +using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +template +bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_param) +{ + static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial"); + + const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param); + const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param); + const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param); + const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(bias_g_n_k_wos_desc); + Tensor residual(bias_g_n_k_wos_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "residual: " << residual.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(OutKernelDataType) * 
bias.mDesc.GetElementSpaceSize()); + DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor in_converted(in); + const Tensor wei_converted(wei); + const Tensor bias_converted(bias); + const Tensor residual_converted(residual); + + in_device_buf.ToDevice(in_converted.mData.data()); + wei_device_buf.ToDevice(wei_converted.mData.data()); + bias_device_buf.ToDevice(bias_converted.mData.data()); + residual_device_buf.ToDevice(residual_converted.mData.data()); +#else + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + residual_device_buf.ToDevice(residual.mData.data()); +#endif + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array d0_g_n_k_wos_lengths{}; + std::array d0_g_n_k_wos_strides{}; + std::array d1_g_n_k_wos_lengths{}; + std::array d1_g_n_k_wos_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths); + copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides); + copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths); + copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = + conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{bias_device_buf.GetDeviceBuffer(), + residual_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 2>{ + {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}}, + std::array, 2>{ + {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(config.do_verification) + { + Tensor c_host(out_g_n_k_wos_desc); + + auto ref_conv = HostConvFwdInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + c_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + // TODO: implement elementwise operation for host + out_host.ForEach([&](auto&, auto idx) { + OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx)); + }); + + out_device_buf.FromDevice(out_device.mData.data()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor out_device_converted(out_device); + + return ck::utils::check_err( + out_device_converted, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); +#else + return ck::utils::check_err( + out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); +#endif + } + + return true; +} + +bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_param = DefaultConvParam; + + if(!parse_cmd_args(argc, argv, config, conv_param)) + { + return false; + } + + switch(conv_param.num_dim_spatial_) + { + case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); + case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); + case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); + } + + return false; +} diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc new file mode 100644 index 00000000..d087c31a --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
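// check_err above is called with two trailing floating-point arguments, which these examples
// use as the relative and absolute tolerances for the device-vs-host comparison (the exact
// parameter order is CK's; treat it as an assumption here). A self-contained stand-in for that
// kind of mixed-tolerance check:
#include <cmath>
#include <cstddef>
#include <vector>

inline bool all_close(const std::vector<float>& result,
                      const std::vector<float>& reference,
                      float rtol = 1e-5f,
                      float atol = 1e-4f)
{
    if(result.size() != reference.size()) { return false; }
    for(std::size_t i = 0; i < result.size(); ++i)
    {
        const float err = std::fabs(result[i] - reference[i]);
        // pass if the error is small either absolutely or relative to the reference value
        if(err > atol + rtol * std::fabs(reference[i])) { return false; }
    }
    return true;
}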
+ +template +using DeviceConvFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InputLayout, + WeightLayout, + ck::Tuple<>, + OutputLayout, + InKernelDataType, + WeiKernelDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 16, // KPerBlock + 4, // AK1 + 4, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 4, // ABlockTransferSrcScalarPerVector + 4, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 4, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 16, 1, 16>, + 4>; + +template +using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +template +bool run_grouped_conv_fwd(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_param) +{ + static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial"); + + const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param); + const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param); + const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor in_converted(in); + const Tensor wei_converted(wei); + + in_device_buf.ToDevice(in_converted.mData.data()); + wei_device_buf.ToDevice(wei_converted.mData.data()); +#else + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); +#endif + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), 
b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{}, + std::array, 0>{}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(config.do_verification) + { + auto ref_conv = HostConvFwdInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor out_device_converted(out_device); + + return ck::utils::check_err( + out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); +#else + return ck::utils::check_err( + out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); +#endif + } + + return true; +} + +bool run_grouped_conv_fwd_example(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_param = DefaultConvParam; + + if(!parse_cmd_args(argc, argv, config, conv_param)) + { + return false; + } + + switch(conv_param.num_dim_spatial_) + { + case 1: return run_grouped_conv_fwd<1>(config, conv_param); + case 2: return run_grouped_conv_fwd<2>(config, conv_param); + case 3: return run_grouped_conv_fwd<3>(config, conv_param); + } + + return false; +} diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt new file mode 100644 index 00000000..d7924825 --- /dev/null +++ b/example/31_batched_gemm_gemm/CMakeLists.txt @@ -0,0 +1,8 @@ +add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp) +add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp) +add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp) + +if(USE_BITINT_EXTENSION_INT4) 
+add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp) +endif(USE_BITINT_EXTENSION_INT4) diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp new file mode 100644 index 00000000..74e0e07e --- /dev/null +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_bf16.cpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o + |------------| + Gemm0 + |---------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using B0DataType = BF16; +using B1DataType = BF16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = BF16; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = PassThrough; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_gemm_example.inc" + +int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 
0 : 1; } diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp new file mode 100644 index 00000000..d5fadb80 --- /dev/null +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp16.cpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o + |------------| + Gemm0 + |---------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = PassThrough; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_gemm_example.inc" + +int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 
0 : 1; } diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp new file mode 100644 index 00000000..0dd4e091 --- /dev/null +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_fp32.cpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o + |------------| + Gemm0 + |---------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using B0DataType = F32; +using B1DataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F32; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = PassThrough; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 16, // KPerBlock + 128, // Gemm1NPerBlock + 16, // Gemm1KPerBlock + 4, // AK1 + 4, // BK1 + 1, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 1, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 16, 1, 16>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 4>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_gemm_example.inc" + +int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 
0 : 1; } diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp new file mode 100644 index 00000000..1fd93622 --- /dev/null +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int4.cpp @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o + |------------| + Gemm0 + |---------------------| + Gemm1 +*/ + +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = ck::int4_t; +using B0DataType = ck::int4_t; +using B1DataType = ck::int4_t; +using KernelADataType = int8_t; +using KernelB0DataType = int8_t; +using KernelB1DataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using CDataType = ck::int4_t; +using KernelCDataType = int8_t; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = PassThrough; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + KernelADataType, + KernelB0DataType, + KernelB1DataType, + KernelCDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 128, // Gemm1NPerBlock + 64, // Gemm1KPerBlock + 16, // AK1 + 16, // BK1 + 4, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 4, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#define BUILD_INT4_EXAMPLE +#include "run_batched_gemm_gemm_example.inc" + +#if defined(BUILD_INT4_EXAMPLE) && 
defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) +static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + +int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 0 : 1; } diff --git a/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp new file mode 100644 index 00000000..15d98aba --- /dev/null +++ b/example/31_batched_gemm_gemm/batched_gemm_gemm_xdl_int8.cpp @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o + |------------| + Gemm0 + |---------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = int8_t; +using B0DataType = int8_t; +using B1DataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using CDataType = int8_t; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = PassThrough; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 128, // Gemm1NPerBlock + 64, // Gemm1KPerBlock + 16, // AK1 + 16, // BK1 + 4, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 4, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_gemm_example.inc" + +int main(int argc, char* argv[]) { return run_batched_gemm_gemm_example(argc, argv) ? 
0 : 1; } diff --git a/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc new file mode 100644 index 00000000..7e5f1614 --- /dev/null +++ b/example/31_batched_gemm_gemm/run_batched_gemm_gemm_example.inc @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +bool run_batched_gemm_gemm_example(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 64; + ck::index_t O = 128; + ck::index_t BatchCount = 4; + ck::index_t StrideA = -1; + ck::index_t StrideB0 = -1; + ck::index_t StrideB1 = -1; + ck::index_t StrideC = -1; + ck::index_t BatchStrideA = -1; + ck::index_t BatchStrideB0 = -1; + ck::index_t BatchStrideB1 = -1; + ck::index_t BatchStrideC = -1; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 9) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + + BatchCount = std::stoi(argv[8]); + } + else if(argc == 17) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + + BatchCount = std::stoi(argv[8]); + + StrideA = std::stoi(argv[9]); + StrideB0 = std::stoi(argv[10]); + StrideB1 = std::stoi(argv[11]); + StrideC = std::stoi(argv[12]); + + BatchStrideA = std::stoi(argv[13]); + BatchStrideB0 = std::stoi(argv[14]); + BatchStrideB1 = std::stoi(argv[15]); + BatchStrideC = std::stoi(argv[16]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 17: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, " + "BatchStrideB0, BatchStrideB1, BatchStrideC\n"); + exit(0); + } + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB0 = ck::is_same_v ? N : K; + const int DefaultStrideB1 = ck::is_same_v ? O : N; + const int DefaultStrideC = ck::is_same_v ? O : M; + + StrideA = (StrideA < 0) ? DefaultStrideA : StrideA; + StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; + StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; + StrideC = (StrideC < 0) ? DefaultStrideC : StrideC; + + const int DefaultBatchStrideA = (ck::is_same_v ? K : M) * StrideA; + const int DefaultBatchStrideB0 = (ck::is_same_v ? N : K) * StrideB0; + const int DefaultBatchStrideB1 = (ck::is_same_v ? O : N) * StrideB1; + const int DefaultBatchStrideC = (ck::is_same_v ? O : M) * StrideC; + + BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA; + BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; + BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; + BatchStrideC = BatchStrideC < 0 ? 
DefaultBatchStrideC : BatchStrideC; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + // C_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + Tensor b0_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{})); + Tensor b1_g_n_o( + f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{})); + Tensor c_g_m_o_host_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + Tensor c_g_m_o_device_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl; + std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; + std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + +#ifdef BUILD_INT4_EXAMPLE + DeviceMem a_g_m_k_device_buf(sizeof(KernelADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(KernelB0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(KernelB1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize()); + DeviceMem c_g_m_o_device_buf(sizeof(KernelCDataType) * + c_g_m_o_device_result.mDesc.GetElementSpaceSize()); + + const Tensor a_g_m_k_converted(a_g_m_k); + const Tensor b0_g_k_n_converted(b0_g_k_n); + const Tensor b1_g_n_o_converted(b1_g_n_o); + + a_g_m_k_device_buf.ToDevice(a_g_m_k_converted.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n_converted.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o_converted.mData.data()); +#else + DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize()); + DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * + c_g_m_o_device_result.mDesc.GetElementSpaceSize()); + + a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); +#endif + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( 
+#ifdef BUILD_INT4_EXAMPLE + static_cast(a_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + static_cast(c_g_m_o_device_buf.GetDeviceBuffer()), +#else + static_cast(a_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + static_cast(c_g_m_o_device_buf.GetDeviceBuffer()), +#endif + M, + N, + K, + O, + BatchCount, + StrideA, + StrideB0, + StrideB1, + StrideC, + BatchStrideA, + BatchStrideB0, + BatchStrideB1, + BatchStrideC, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + if(do_verification) + { + // Output of Gemm0 is input A of Gemm1 + Tensor a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, a1_g_m_n, a_element_op, b0_element_op, PassThrough{}); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + +#ifdef BUILD_INT4_EXAMPLE + Tensor c_g_m_o_device_result_converted(c_g_m_o_host_result.mDesc); + + c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result_converted.mData.data()); + + c_g_m_o_device_result = c_g_m_o_device_result_converted.CopyAsType(); +#else + c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); +#endif + + return ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result); + } + + return true; +} diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt new file mode 100644 index 00000000..8d9aaec8 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -0,0 +1,16 @@ +add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp) +add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp) +add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) 
+add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) +add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) + +add_custom_target(example_gemm_scale_softmax_gemm) +add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16) +add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_bf16) +add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16) +add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16) +add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16) +add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) +add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp new file mode 100644 index 00000000..0eb15653 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = 
ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + Acc0BiasDataType, + Acc1BiasDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp new file mode 100644 index 00000000..8f1db577 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. 
Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using B0DataType = BF16; +using B1DataType = BF16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = BF16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + Acc0BiasDataType, + Acc1BiasDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization + +// Ref Gemm0: bf16 in, fp32 out +using 
ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, bf16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: bf16 in, bf16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp new file mode 100644 index 00000000..2ce91a8c --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + 
Acc0BiasDataType, + Acc1BiasDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp new file mode 100644 index 00000000..1fd2bf69 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_bf16.cpp @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. 
Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using B0DataType = BF16; +using B1DataType = BF16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = BF16; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + false>; + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp new file mode 100644 index 00000000..f4a85890 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/batched_gemm_scale_softmax_gemm_xdl_fp16.cpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// 
Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; + +using ALayout = Row; +using B0Layout = Col; +using B1Layout = Row; +using CLayout = Row; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + false>; + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_batched_gemm_scale_softmax_gemm.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp new 
file mode 100644 index 00000000..e4a71b04 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + Acc0BiasDataType, + Acc1BiasDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 
1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_grouped_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp new file mode 100644 index 00000000..38b5badc --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +/* +Gemm + Softmax + Gemm fused operation. Computes C_g_m_o = Softmax(A_g_m_k * B0_g_k_n) * B1_g_n_o + |-----------------| + Gemm0 + |-------------------------------------| + Gemm1 +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using B0DataType = F16; +using B1DataType = F16; +using AccDataType = F32; +using CShuffleDataType = F32; +using CDataType = F16; +using Acc0BiasDataType = ck::Tuple<>; +using Acc1BiasDataType = ck::Tuple<>; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 1; +static constexpr ck::index_t NumDimN = 1; +static constexpr ck::index_t NumDimK = 1; +static constexpr ck::index_t NumDimO = 1; + +using AElementOp = PassThrough; +using B0ElementOp = PassThrough; +using Acc0ElementOp = ck::tensor_operation::element_wise::Scale; +using B1ElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; +static constexpr auto MaskingSpec = + ck::tensor_operation::device::MaskingSpecialization::MaskDisabled; + +static constexpr auto TensorSpecA = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecB1 = 
ck::tensor_operation::device::TensorSpecialization::Default; +static constexpr auto TensorSpecC = ck::tensor_operation::device::TensorSpecialization::Default; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle< + NumDimG, + NumDimM, + NumDimN, + NumDimK, + NumDimO, + ADataType, + B0DataType, + B1DataType, + CDataType, + Acc0BiasDataType, + Acc1BiasDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecA, + TensorSpecB0, + TensorSpecB1, + TensorSpecC, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 64, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 2, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<16, 16, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpec>; // MaskingSpecialization + +// Ref Gemm0: fp16 in, fp32 out +using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +// Ref Softmax: fp32 in, fp16 out +using ReferenceSoftmaxInstance = + ck::tensor_operation::host::ReferenceSoftmax; + +// Ref Gemm1: fp16 in, fp16 out +using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm; + +#include "run_grouped_gemm_scale_softmax_gemm_permute.inc" + +int main(int argc, char* argv[]) { return run(argc, argv); } diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc new file mode 100644 index 00000000..4e43dbdd --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm.inc @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
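+
+// Example runner for batched GEMM + scale + softmax + GEMM.
+// Optional command-line arguments: verification, init method and kernel timing; then
+// M, N, K, O and BatchCount; then explicit strides / batch strides and the softmax scale alpha.
+// Strides passed as -1 are replaced below by defaults derived from the tensor layouts.
+// With verification enabled, the device result is checked against the host pipeline
+//   acc0 = alpha * (A x B0)  ->  a1 = softmax(acc0, over N)  ->  C = a1 x B1.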
+ +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 2; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1020; + ck::index_t N = 1020; + ck::index_t K = 64; + ck::index_t O = 128; + ck::index_t BatchCount = 4; + ck::index_t StrideA = -1; + ck::index_t StrideB0 = -1; + ck::index_t StrideB1 = -1; + ck::index_t StrideC = -1; + ck::index_t BatchStrideA = -1; + ck::index_t BatchStrideB0 = -1; + ck::index_t BatchStrideB1 = -1; + ck::index_t BatchStrideC = -1; + float alpha = 1; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 9) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + + BatchCount = std::stoi(argv[8]); + } + else if(argc == 18) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + + BatchCount = std::stoi(argv[8]); + + StrideA = std::stoi(argv[9]); + StrideB0 = std::stoi(argv[10]); + StrideB1 = std::stoi(argv[11]); + StrideC = std::stoi(argv[12]); + + BatchStrideA = std::stoi(argv[13]); + BatchStrideB0 = std::stoi(argv[14]); + BatchStrideB1 = std::stoi(argv[15]); + BatchStrideC = std::stoi(argv[16]); + + alpha = std::stof(argv[17]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 16: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, " + "BatchStrideB0, BatchStrideB1, BatchStrideC\n"); + printf("arg17: scale (alpha)\n"); + exit(0); + } + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB0 = ck::is_same_v ? N : K; + const int DefaultStrideB1 = ck::is_same_v ? O : N; + const int DefaultStrideC = ck::is_same_v ? O : M; + + StrideA = (StrideA < 0) ? DefaultStrideA : StrideA; + StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; + StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; + StrideC = (StrideC < 0) ? DefaultStrideC : StrideC; + + const int DefaultBatchStrideA = (ck::is_same_v ? K : M) * StrideA; + const int DefaultBatchStrideB0 = (ck::is_same_v ? N : K) * StrideB0; + const int DefaultBatchStrideB1 = (ck::is_same_v ? O : N) * StrideB1; + const int DefaultBatchStrideC = (ck::is_same_v ? O : M) * StrideC; + + BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA; + BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; + BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; + BatchStrideC = BatchStrideC < 0 ? 
DefaultBatchStrideC : BatchStrideC; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + if(std::is_same::value) + { + return HostTensorDescriptor(std::vector({batch_count, row, col}), + std::vector({batch_stride, stride, 1})); + } + else + { + return HostTensorDescriptor(std::vector({batch_count, row, col}), + std::vector({batch_stride, 1, stride})); + } + }; + + // C_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + Tensor b0_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{})); + Tensor b1_g_n_o( + f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{})); + Tensor c_g_m_o_host_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + Tensor c_g_m_o_device_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl; + std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; + std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSpaceSize()); + DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * + c_g_m_o_device_result.mDesc.GetElementSpaceSize()); + + a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = + gemm.MakeArgument(static_cast(a_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + static_cast(c_g_m_o_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + BatchCount, + StrideA, + StrideB0, + StrideB1, + StrideC, + BatchStrideA, + BatchStrideB0, + BatchStrideB1, + BatchStrideC, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this 
problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); + + if(do_verification) + { + // Output of Gemm0 is input A of Gemm1 + Tensor acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + + Tensor a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1; + } + + return 0; +} diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc new file mode 100644 index 00000000..0b876af9 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_batched_gemm_scale_softmax_gemm_permute.inc @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape for A/B0/B1/C + // C_g_m_o = A_g_m_k * B0_g_k_n * B1_g_n_o + ck::index_t M = 120; + ck::index_t N = 1000; + ck::index_t K = 64; + ck::index_t O = 128; + + // Output shape C[G0, M, G1, O]. 
Batch dim, outer dim, inner dim must match GEMM shape + // C_g0_g1_m_o = reshape(C_g_m_o, [g0, g1, m, o]) + // C_g0_m_g1_o = permute(C_g0_g1_m_o, [0, 2, 1, 3]) + ck::index_t G0 = 7; + ck::index_t G1 = 13; + + float alpha = 1; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 13) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + G0 = std::stoi(argv[8]); + G1 = std::stoi(argv[9]); + + alpha = std::stof(argv[10]); + + input_permute = std::stoi(argv[11]); + output_permute = std::stoi(argv[12]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 11: M, N, K, O, G0, G1\n"); + printf("arg10: scale (alpha)\n"); + printf("arg11 to 12: input / output permute\n"); + exit(0); + } + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] + : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ? std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] + : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? 
std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data()); + b1_device_buf.ToDevice(b1_gs_os_ns.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + // TODO ANT: replace array with vector? 
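+    // Note: unlike the plain batched runner, the permute variant below passes the full
+    // gs/ms/ks length and stride vectors instead of scalar M/N/K/O and strides; the empty {}
+    // arguments are the unused acc0/acc1 bias tensors (their data types are empty ck::Tuple<>).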
+ auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( + static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + {}, // std::array p_acc0_biases; + {}, // std::array p_acc1_biases; + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // std::array, 1>{acc0_biases_gs_ms_ns_lengths}, + {}, // std::array, 1>{acc0_biases_gs_ms_ns_strides}, + {}, // std::array, 1>{acc1_biases_gs_ms_os_lengths}, + {}, // std::array, 1>{acc1_biases_gs_ms_os_strides}, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + ck::index_t BatchCount = G0 * G1; + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = DeviceGemmInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto 
idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && std::is_same_v && + std::is_same_v && std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + return ck::utils::check_err(c_gs_ms_os_device_result.mData, + c_gs_ms_os_host_result.mData, + "Error: Incorrect results!", + rtol, + atol) + ? 0 + : 1; + } + + return 0; +} diff --git a/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc new file mode 100644 index 00000000..ef2acf61 --- /dev/null +++ b/example/32_batched_gemm_scale_softmax_gemm/run_grouped_gemm_scale_softmax_gemm_permute.inc @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +int run(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + bool input_permute = false; + bool output_permute = true; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 6) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + input_permute = std::stoi(argv[4]); + output_permute = std::stoi(argv[5]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 5: input / output permute\n"); + exit(0); + } + + float alpha = 1; // scaling after 1st gemm + + std::size_t group_count = 7; + + // Problem descs + std::vector problem_descs; + std::vector p_a; + std::vector p_b0; + std::vector p_b1; + std::vector p_c; + std::vector> g0_g1_m_n_k_o; + + std::vector> a_tensors; + std::vector> b0_tensors; + std::vector> b1_tensors; + std::vector> c_tensors; + + using DeviceMemPtr = std::unique_ptr; + std::vector a_tensors_device; + std::vector b0_tensors_device; + std::vector b1_tensors_device; + std::vector c_tensors_device; + + std::size_t flop = 0, num_byte = 0; + + std::cout << "group count " << group_count << ". printing first 4 groups\n"; + for(std::size_t i = 0; i < group_count; i++) + { + int M = 128 * (rand() % 8 + 1); + int N = 128 * (rand() % 8 + 1); + int K = 40; + int O = 40 * (rand() % 2 + 1); + int G0 = rand() % 3 + 1; + int G1 = rand() % 5 + 1; + + g0_g1_m_n_k_o.push_back({G0, G1, M, N, K, O}); + + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides = + input_permute + ? std::vector{M * G1 * K, K, G1 * K, 1} // A layout [G0, M, G1, K] + : std::vector{G1 * M * K, M * K, K, 1}; // A layout [G0, G1, M, K] + + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides = + input_permute + ? std::vector{N * G1 * K, K, G1 * K, 1} // B0 layout [G0, N, G1, K] + : std::vector{G1 * N * K, N * K, K, 1}; // B0 layout [G0, G1, N, K] + + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides = + input_permute + ? 
std::vector{N * G1 * O, O, 1, G1 * O} // B1 layout [G0, N, G1, O] + : std::vector{G1 * N * O, N * O, 1, O}; // B1 layout [G0, G1, N, O] + + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides = + output_permute + ? std::vector{M * G1 * O, O, G1 * O, 1} // C layout [G0, M, G1, O] + : std::vector{G1 * M * O, M * O, O, 1}; // C layout [G0, G1, M, O] + + problem_descs.push_back({a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}}); // acc1_biases_gs_ms_os_strides + + // C_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + int Batch = G0 * G1; + flop += (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * Batch; + num_byte += (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + Batch; + + if(i < 4) + { + std::cout << "a_gs_ms_ks[" << i << "]: " << a_gs_ms_ks.mDesc << ", " + << "b0_gs_ns_ks[" << i << "]: " << b0_gs_ns_ks.mDesc << ", " + << "b1_gs_os_ns[" << i << "]: " << b1_gs_os_ns.mDesc << ", " + << "c_gs_ms_os[" << i << "]: " << c_gs_ms_os_device_result.mDesc << std::endl; + } + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + a_tensors.push_back(a_gs_ms_ks); + b0_tensors.push_back(b0_gs_ns_ks); + b1_tensors.push_back(b1_gs_os_ns); + c_tensors.push_back(c_gs_ms_os_device_result); + + a_tensors_device.emplace_back(std::make_unique( + sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize())); + b0_tensors_device.emplace_back(std::make_unique( + sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize())); + b1_tensors_device.emplace_back(std::make_unique( + sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize())); + c_tensors_device.emplace_back(std::make_unique( + sizeof(CDataType) * c_gs_ms_os_device_result.mDesc.GetElementSpaceSize())); + + a_tensors_device[i]->ToDevice(a_gs_ms_ks.mData.data()); + b0_tensors_device[i]->ToDevice(b0_gs_ns_ks.mData.data()); + b1_tensors_device[i]->ToDevice(b1_gs_os_ns.mData.data()); + + p_a.push_back(a_tensors_device[i]->GetDeviceBuffer()); + p_b0.push_back(b0_tensors_device[i]->GetDeviceBuffer()); + p_b1.push_back(b1_tensors_device[i]->GetDeviceBuffer()); + p_c.push_back(c_tensors_device[i]->GetDeviceBuffer()); + } + + auto a_element_op = AElementOp{}; + auto b0_element_op = 
B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(p_a, + p_b0, + p_b1, + p_c, + {}, // p_acc0_biases + {}, // p_acc1_biases + problem_descs, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + // specify workspace for problem_desc + DeviceMem problem_desc_workspace(gemm.GetWorkSpaceSize(&argument)); + + gemm.SetWorkSpacePointer(&argument, problem_desc_workspace.GetDeviceBuffer()); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + bool pass = true; + if(do_verification) + { + for(std::size_t i = 0; i < group_count; i++) + { + const int& G0 = g0_g1_m_n_k_o[i][0]; + const int& G1 = g0_g1_m_n_k_o[i][1]; + const int& M = g0_g1_m_n_k_o[i][2]; + const int& N = g0_g1_m_n_k_o[i][3]; + const int& K = g0_g1_m_n_k_o[i][4]; + const int& O = g0_g1_m_n_k_o[i][5]; + + const auto& c_gs_ms_os_lengths = problem_descs[i].c_gs_ms_os_lengths; + const auto& c_gs_ms_os_strides = problem_descs[i].c_gs_ms_os_strides; + + const auto& a_gs_ms_ks = a_tensors[i]; + const auto& b0_gs_ns_ks = b0_tensors[i]; + const auto& b1_gs_os_ns = b1_tensors[i]; + auto& c_gs_ms_os_device_result = c_tensors[i]; + auto& c_gs_ms_os_device_buf = *c_tensors_device[i]; + + c_gs_ms_os_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({G0 * G1, M, K}); + Tensor b0_g_k_n({G0 * G1, K, N}); + Tensor b1_g_n_o({G0 * G1, N, O}); + Tensor acc0_g_m_n({G0 * G1, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({G0 * G1, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({G0 * G1, M, O}); // scratch object after gemm1 + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + // gemm 0 + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // masking + const auto mask = DeviceGemmInstance::C0MatrixMask(N); + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(mask.IsMaskedElement(idx[1], idx[2])) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + // softmax + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + // gemm 1 + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto 
ref_gemm1_argument = ref_gemm1.MakeArgument(a1_g_m_n, + b1_g_n_o, + c_g_m_o_host_result, + PassThrough{}, + b1_element_op, + c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + + bool pass_ = + ck::utils::check_err(c_gs_ms_os_device_result.mData, c_gs_ms_os_host_result.mData); + pass &= pass_; + } + } + + return pass ? 0 : 1; +} diff --git a/example/33_multiple_reduce/CMakeLists.txt b/example/33_multiple_reduce/CMakeLists.txt new file mode 100644 index 00000000..bc8c3eb0 --- /dev/null +++ b/example/33_multiple_reduce/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_dual_reduce_multiblock dual_reduce_multiblock.cpp) +add_example_executable(example_dual_reduce_threadwise dual_reduce_threadwise.cpp) diff --git a/example/33_multiple_reduce/README.md b/example/33_multiple_reduce/README.md new file mode 100644 index 00000000..90762a69 --- /dev/null +++ b/example/33_multiple_reduce/README.md @@ -0,0 +1,37 @@ +# Instructions for ```example_dual_reduce``` + +## Run ```example_dual_reduce_multiblock``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg2: time kernel (0=no, 1=yes) +./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1 +``` + +Result +``` +./bin/example_dual_reduce_multiblock -D 600,28,28,256 -v 1 2 1 +launch_and_time_kernel: grid_dim {150, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.19529 ms, 201.499 GB/s, DeviceMultipleReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_1_InSrcVectorSize_1,OutDstVectorSize_1_1> +``` + +## Run ```example_dual_reduce_threadwise``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg2: time kernel (0=no, 1=yes) +./bin/example_dual_reduce_multiblock -D 8000,4,4,4 -v 1 2 1 +``` + +Result +``` +./bin/example_dual_reduce_threadwise -D 8000,4,4,4 -v 1 2 1 +launch_and_time_kernel: grid_dim {32, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 0.01512 ms, 71.9577 GB/s, DeviceMultipleReduceThreadwise<256,M_C256_S1,K_C1_S4,InSrcVectorDim_1_InSrcVectorSize_2,OutDstVectorSize_1_1> +``` diff --git a/example/33_multiple_reduce/dual_reduce_common.hpp b/example/33_multiple_reduce/dual_reduce_common.hpp new file mode 100644 index 00000000..376b95ea --- /dev/null +++ b/example/33_multiple_reduce/dual_reduce_common.hpp @@ -0,0 +1,314 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
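+
+// Utilities shared by the dual-reduce examples in this directory:
+//  - SimpleAppArgs: getopt-based parsing of -D (input NHWC lengths) and -v (verification),
+//    plus two positional arguments selecting the init method and kernel timing.
+//  - mean_meansquare_host: multithreaded host reference computing, per image n,
+//    mean(n) = sum(x) / (h*w*c) and meansquare(n) = sum(x*x) / (h*w*c) over H, W and C.
+//  - mean_meansquare_dual_reduce_test: templated driver that fills device buffers, runs the
+//    DeviceDualReduce instance defined by the including example and compares its two outputs
+//    against the host reference.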
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class SimpleAppArgs +{ + private: + int option_index = 0; + + public: + std::vector inLengths = {600, 28, 28, 256}; + size_t n, h, w, c; + + bool do_verification = true; + int init_method = 2; + bool time_kernel = true; + + public: + SimpleAppArgs() + { + n = inLengths[0]; + h = inLengths[1]; + w = inLengths[2]; + c = inLengths[3]; + }; + + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "Arg1 -- init method (0=no init, 1=single integer value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg2 -- time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:l:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + if(inLengths.size() != 4) + throw std::runtime_error( + "Invalid option format! 
The number of integers is incorrect!"); + + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 2 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + n = inLengths[0]; + h = inLengths[1]; + w = inLengths[2]; + c = inLengths[3]; + + return (0); + }; +}; + +template +static void mean_meansquare_host(const Tensor& in, + Tensor& mean_ref, + Tensor& meansquare_ref, + size_t n, + size_t h, + size_t w, + size_t c) + +{ + auto thread_reduce_func = [&](auto iN) { + AccDataType mean = ck::type_convert(0.0f); + AccDataType meansquare = ck::type_convert(0.0f); + + // compute mean, meanquare, variance, invVariance + for(std::size_t iH = 0; iH < h; iH++) + { + for(std::size_t iW = 0; iW < w; iW++) + { + for(std::size_t iC = 0; iC < c; iC++) + { + AccDataType curr_value = ck::type_convert(in(iN, iH, iW, iC)); + + mean += curr_value; + meansquare += curr_value * curr_value; + }; + } + }; + + mean = mean / (h * w * c); + meansquare = meansquare / (h * w * c); + + mean_ref(iN) = ck::type_convert(mean); + meansquare_ref(iN) = ck::type_convert(meansquare); + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = (n + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; it++) + { + std::size_t iN_begin = it * work_per_thread; + std::size_t iN_end = std::min(static_cast((it + 1) * work_per_thread), n); + + auto f = [=] { + for(std::size_t iN = iN_begin; iN < iN_end; iN++) + { + thread_reduce_func(iN); + } + }; + + threads[it] = joinable_thread(f); + } +}; + +using ReduceOperation = ck::reduce::Add; + +using InElementwiseOperation_Mean = ck::tensor_operation::element_wise::PassThrough; +using AccElementwiseOperation_Mean = ck::tensor_operation::element_wise::UnaryDivide; + +using InElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnarySquare; +using AccElementwiseOperation_Meansquare = ck::tensor_operation::element_wise::UnaryDivide; + +using InElementwiseOperationTuple = + ck::Tuple; +using AccElementwiseOperationTuple = + ck::Tuple; + +template +int mean_meansquare_dual_reduce_test(size_t n, + size_t h, + size_t w, + size_t c, + bool do_verification, + int init_method, + bool time_kernel, + const std::array reduceDims) +{ + const std::vector inLengths = {n, h, w, c}; + + Tensor in(inLengths); + + std::vector outLengths{n}; + + Tensor mean_ref(outLengths); + Tensor mean(outLengths); + Tensor meansquare_ref(outLengths); + Tensor meansquare(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = mean.mDesc.GetStrides(); + + size_t invariant_total_length = n; + size_t reduce_total_length = h * w * c; + + const AccDataType alpha = ck::type_convert(1.0f); + const AccDataType beta = ck::type_convert(0.0f); + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); break; + case 2: in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); break; + default: in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + } 
+ }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem mean_dev(sizeof(OutDataType) * mean.mDesc.GetElementSpaceSize()); + DeviceMem meansquare_dev(sizeof(OutDataType) * meansquare.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(do_verification) + { + mean_meansquare_host( + in, mean_ref, meansquare_ref, n, h, w, c); + }; + + constexpr ck::index_t NumInputDim = Rank; + constexpr ck::index_t NumOutputDim = (Rank - NumReduceDim > 1) ? Rank - NumReduceDim : 1; + + std::array i_inLengths; + std::array i_inStrides; + std::array i_outLengths; + std::array i_outStrides; + + ck::ranges::copy(inLengths, i_inLengths.begin()); + ck::ranges::copy(inStrides, i_inStrides.begin()); + ck::ranges::copy(outLengths, i_outLengths.begin()); + ck::ranges::copy(outStrides, i_outStrides.begin()); + + auto dual_reduce_op = DeviceDualReduce{}; + + auto argument_ptr = dual_reduce_op.MakeArgumentPointer( + i_inLengths, + i_inStrides, + i_outLengths, + {i_outStrides, i_outStrides}, + reduceDims, + {&alpha, &alpha}, + {&beta, &beta}, + in_dev.GetDeviceBuffer(), + {mean_dev.GetDeviceBuffer(), meansquare_dev.GetDeviceBuffer()}, + ck::make_tuple(InElementwiseOperation_Mean{}, InElementwiseOperation_Meansquare{}), + ck::make_tuple( + AccElementwiseOperation_Mean{static_cast(reduce_total_length)}, + AccElementwiseOperation_Meansquare{static_cast(reduce_total_length)})); + + if(!dual_reduce_op.IsSupportedArgument(argument_ptr.get())) + { + std::cout + << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" + << std::endl; + return (-1); + }; + + std::string reduce_name = dual_reduce_op.GetTypeString(); + + auto invoker_ptr = dual_reduce_op.MakeInvokerPointer(); + + float avg_time = 0.0f; + + avg_time += invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = invariant_total_length * reduce_total_length * sizeof(InDataType) + + 2 * invariant_total_length * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " << reduce_name + << std::endl; + + bool pass = true; + + if(do_verification) + { + mean_dev.FromDevice(mean.mData.data()); + meansquare_dev.FromDevice(meansquare.mData.data()); + pass = pass && ck::utils::check_err(mean, mean_ref); + pass = pass && ck::utils::check_err(meansquare, meansquare_ref); + }; + + return (pass ? 0 : 1); +} diff --git a/example/33_multiple_reduce/dual_reduce_multiblock.cpp b/example/33_multiple_reduce/dual_reduce_multiblock.cpp new file mode 100644 index 00000000..9360599e --- /dev/null +++ b/example/33_multiple_reduce/dual_reduce_multiblock.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
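+
+// Mean / mean-square dual reduction over the H, W and C dimensions of an NHWC input
+// (fp16 in, two fp32 outputs per image) using the multi-block device instance.
+// Runs the default 600x28x28x256 problem when no arguments are given, otherwise
+// parses -D n,h,w,c -v 0|1 <init_method> <time_kernel> via SimpleAppArgs.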
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "dual_reduce_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InDataType = ck::half_t; +using OutDataType = float; +using OutDataTypeTuple = Tuple; +using AccDataType = float; + +// for NHWC layer-norm calculation of mean and meansquare +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +constexpr bool PropagateNan = false; + +constexpr InMemoryDataOperationEnum OutMemoryDataOperation = InMemoryDataOperationEnum::Set; + +using DeviceDualReduce = DeviceMultipleReduceMultiBlock<2, + InDataType, + AccDataType, + OutDataTypeTuple, + Rank, + NumReduceDim, + ReduceOperation, + InElementwiseOperationTuple, + AccElementwiseOperationTuple, + OutMemoryDataOperation, + PropagateNan, + 256, + 4, + 64, + 1, + 1, + 1, // InSrcVectorDim + 1, + ck::Sequence<1, 1>>; + +int main(int argc, char* argv[]) +{ + int retval = 0; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test(arg.n, + arg.h, + arg.w, + arg.c, + arg.do_verification, + arg.init_method, + arg.time_kernel, + reduceDims); + } + else + { + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test( + 600, 28, 28, 256, true, 2, true, reduceDims); + }; + + return (retval); +} diff --git a/example/33_multiple_reduce/dual_reduce_threadwise.cpp b/example/33_multiple_reduce/dual_reduce_threadwise.cpp new file mode 100644 index 00000000..56255839 --- /dev/null +++ b/example/33_multiple_reduce/dual_reduce_threadwise.cpp @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
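+
+// Same mean / mean-square dual reduction as the multi-block example, but built on the
+// thread-wise device instance; the default problem here is the much smaller 8000x4x4x4.
+// Command-line handling is identical to dual_reduce_multiblock.cpp.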
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "dual_reduce_common.hpp" + +using namespace ck; +using namespace ck::tensor_operation::device; + +using InDataType = ck::half_t; +using OutDataType = float; +using OutDataTypeTuple = Tuple; +using AccDataType = float; + +// for NHWC layer-norm calculation of mean and meansquare +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +constexpr bool PropagateNan = false; + +using DeviceDualReduce = DeviceMultipleReduceThreadWise<2, + InDataType, + AccDataType, + OutDataTypeTuple, + Rank, + NumReduceDim, + ReduceOperation, + InElementwiseOperationTuple, + AccElementwiseOperationTuple, + PropagateNan, + 256, + 1, + 4, + 1, // InSrcVectorDim + 2, + ck::Sequence<1, 1>>; + +int main(int argc, char* argv[]) +{ + int retval = 0; + + if(argc > 1) + { + SimpleAppArgs arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test(arg.n, + arg.h, + arg.w, + arg.c, + arg.do_verification, + arg.init_method, + arg.time_kernel, + reduceDims); + } + else + { + std::array reduceDims = {1, 2, 3}; + + retval = mean_meansquare_dual_reduce_test( + 8000, 4, 4, 4, true, 2, true, reduceDims); + }; + + return (retval); +} diff --git a/example/34_batchnorm/CMakeLists.txt b/example/34_batchnorm/CMakeLists.txt new file mode 100644 index 00000000..d964f40d --- /dev/null +++ b/example/34_batchnorm/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_batchnorm_forward_training batchnorm_forward_training_nhwc.cpp) +add_example_executable(example_batchnorm_forward_inferring batchnorm_forward_inferring_nhwc.cpp) +add_example_executable(example_batchnorm_backward batchnorm_backward_nhwc.cpp) diff --git a/example/34_batchnorm/README.md b/example/34_batchnorm/README.md new file mode 100644 index 00000000..294e32b9 --- /dev/null +++ b/example/34_batchnorm/README.md @@ -0,0 +1,81 @@ +# Instructions for ```batchnorm nhwc``` Example + +## Run ```batchnorm forward nhwc``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64) +#arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes) +#arg3: 1/0 to indicate whether to save result mean/invVariance (0=no, 1=yes) +#arg4: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg5: time kernel (0=no, 1=yes) +./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1 +``` + +Result +``` +./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 0 1 2 1 +launch_and_time_kernel: grid_dim {64, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 2.08231 ms, 354.519 GB/s +``` + +Result +``` +./bin/example_batchnorm_forward -D 128,16,16,1024 -v 1 0 1 0 2 0 +echo $? 
+0 +``` + +## Run ```batchnorm infer nhwc``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +#arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64) +#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +#arg3: time kernel (0=no, 1=yes) +./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1 +``` + +Result +``` +./bin/example_batchnorm_infer -D 128,16,16,1024 -v 1 0 2 1 +launch_and_time_kernel: grid_dim {120, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.28235 ms, 523.329 GB/s +``` + +## Run ```batchnorm backward nhwc``` +```bash +# -D : input 4-d tensor lengths +# -v : verification (0=no, 1=yes) +Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64) +Arg2 -- 1/0 to indicate whether to use saved mean and invVariance +Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value) +Arg4 -- time kernel (0=no, 1=yes) +Arg5: use multi-block welford (0=n0, 1=yes) +./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1 +``` + +Result +``` +./bin/example_batchnorm_backward -D 128,16,3,1024 -v 1 0 0 3 1 1 +launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +launch_and_time_kernel: grid_dim {6144, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 0.411026 ms, 91.8702 GB/s +``` diff --git a/example/34_batchnorm/batchnorm_backward_nhwc.cpp b/example/34_batchnorm/batchnorm_backward_nhwc.cpp new file mode 100644 index 00000000..a6ca9d15 --- /dev/null +++ b/example/34_batchnorm/batchnorm_backward_nhwc.cpp @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
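+
+// Batchnorm backward example for NHWC tensors. The data type (fp16, fp32, bf16 or fp64),
+// whether saved mean/invVariance are consumed, the init method, kernel timing and the
+// multi-block Welford path are selected on the command line (see BatchNormBwdArg); when
+// verification is enabled, dx, dscale and dbias are checked against the host reference.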
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" + +static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchNormBwdArg +{ + private: + int option_index = 0; + + public: + std::vector inOutLengths; + + bool do_verification = false; + + bool haveSavedMeanInvVar; + + int data_type = 0; + int init_method = 3; + bool time_kernel = false; + bool use_multiblock_welford = false; + + public: + void show_usage(const char* cmd) + { + // clang-format off + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2 -- 1/0 to indicate whether to use saved mean and invVariance" << std::endl; + std::cout << "Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl; + std::cout << "Arg4 -- time kernel (0=no, 1=yes)" << std::endl; + std::cout << "Arg5: use multi-block welford (0=n0, 1=yes)" << std::endl; + // clang-format on + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inOutLengths = getTypeValuesFromString(optarg); + + if(inOutLengths.size() != 4) + throw std::runtime_error( + "NHWC tensor layout should have 4 length values specified!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 5 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + haveSavedMeanInvVar = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind++])); + use_multiblock_welford = static_cast(std::atoi(argv[optind])); + + return (0); + }; +}; + +using namespace ck; + +template +bool bnorm_bwd_nhwc_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector inOutLengths, + bool haveSavedMeanInvVar, + double epsilon) +{ + // for NHWC BatchNorm calculation of mean and meansquare + constexpr index_t Rank = 4; + constexpr index_t NumReduceDim = 3; + + using ScaleDataType = XDataType; + + const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; + + // input data 
of the batchnorm backward algorithm + Tensor x(inOutLengths); + Tensor dy(inOutLengths); + + Tensor bnScale(scaleBiasMeanVarLengths); + + Tensor savedMean(scaleBiasMeanVarLengths); + Tensor savedInvVar(scaleBiasMeanVarLengths); + // savedVariance is only used for initializing savedInvVar + Tensor savedVariance(scaleBiasMeanVarLengths); + + // output data of the batchnorm backward algorithm + Tensor dx_ref(inOutLengths); + Tensor dx(inOutLengths); + + Tensor dscale(scaleBiasMeanVarLengths); + Tensor dbias(scaleBiasMeanVarLengths); + + Tensor dscale_ref(scaleBiasMeanVarLengths); + Tensor dbias_ref(scaleBiasMeanVarLengths); + + auto inOutStrides = dy.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = dscale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(haveSavedMeanInvVar) + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.0001f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the savedMean to be values with tiny variation to the mean of the x values + savedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + // initialize the variance to be values with tiny variation to the variance of the x values + savedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + + auto it_src = savedVariance.mData.begin(); + auto it_dst = savedInvVar.mData.begin(); + float tmp_epsilon = std::numeric_limits::epsilon(); + + while(it_src != savedVariance.mData.end()) + { + *it_dst = type_convert( + 1.0f / std::sqrtf(type_convert(*it_src) + tmp_epsilon)); + + it_src++; + it_dst++; + }; + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + dy.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + dy.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + dy.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + dy.GenerateTensorValue(GeneratorTensor_3{-0.2f, 0.2f}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_3{-0.5f, 0.5f}, num_thread); + } + }; + + // input data of the batchnorm backward algorithm + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem dy_dev(sizeof(AccDataType) * dy.mDesc.GetElementSpaceSize()); + + DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize()); + + DeviceMem savedMean_dev(sizeof(AccDataType) * savedMean.mDesc.GetElementSpaceSize()); + DeviceMem savedInvVar_dev(sizeof(AccDataType) * savedInvVar.mDesc.GetElementSpaceSize()); + + // output data of the batchnorm backward algorithm + DeviceMem dx_dev(sizeof(AccDataType) * dx.mDesc.GetElementSpaceSize()); + + DeviceMem dscale_dev(sizeof(AccDataType) * dscale.mDesc.GetElementSpaceSize()); + DeviceMem dbias_dev(sizeof(AccDataType) * dbias.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + dy_dev.ToDevice(dy.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + + if(haveSavedMeanInvVar) + { + savedMean_dev.ToDevice(savedMean.mData.data()); 
+ savedInvVar_dev.ToDevice(savedInvVar.mData.data()); + }; + + std::array i_inOutLengths; + std::array i_inOutStrides; + std::array i_scaleBiasMeanVarLengths; + std::array i_scaleBiasMeanVarStrides; + + std::copy(inOutLengths.begin(), inOutLengths.end(), i_inOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), i_inOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + i_scaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + i_scaleBiasMeanVarStrides.begin()); + + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + using DeviceBatchNormBwdInstance = + ck::tensor_operation::device::DeviceBatchNormBwdImpl; // MeanVarSrcVectorSize + + auto batchnorm_bwd = DeviceBatchNormBwdInstance{}; + + auto argument_ptr = batchnorm_bwd.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + dy_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr, + haveSavedMeanInvVar ? savedInvVar_dev.GetDeviceBuffer() : nullptr, + epsilon, + PassThroughOp{}, + dx_dev.GetDeviceBuffer(), + dscale_dev.GetDeviceBuffer(), + dbias_dev.GetDeviceBuffer()); + + if(!batchnorm_bwd.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters seems not supported by the BatchNorm device instance, " + "exiting!" + << std::endl; + return (false); + }; + + size_t workspace_sz = batchnorm_bwd.GetWorkSpaceSize(argument_ptr.get()); + + DeviceMem workspace_dev(workspace_sz); + + batchnorm_bwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = batchnorm_bwd.MakeInvokerPointer(); + + if(time_kernel) + { + float avg_time = 0.0f; + size_t num_bytes = 0; + + size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3]; + size_t invariant_length = inOutLengths[3]; + + avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + // inputing of x, dy, scale, outputing of dx, dscale, dbias + num_bytes += + total_length * sizeof(XDataType) * 3 + invariant_length * sizeof(AccDataType) * 3; + + // outputing of mean, inv-variance + num_bytes += haveSavedMeanInvVar ? invariant_length * sizeof(AccDataType) * 2 : 0; + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + } + else + (void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + bool pass = true; + + if(do_verification) + { + using ReferenceBatchNormBwdInstance = + ck::tensor_operation::host::ReferenceBatchNormBwd; + + auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{}; + + auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x.mData.data(), + dy.mData.data(), + bnScale.mData.data(), + haveSavedMeanInvVar ? savedMean.mData.data() : nullptr, + haveSavedMeanInvVar ? 
savedInvVar.mData.data() : nullptr, + epsilon, + PassThroughOp{}, + dx_ref.mData.data(), + dscale_ref.mData.data(), + dbias_ref.mData.data()); + + if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout + << "The runtime parameters seems not supported by the device instance, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + + dx_dev.FromDevice(dx.mData.data()); + dscale_dev.FromDevice(dscale.data()); + dbias_dev.FromDevice(dbias.data()); + + // clang-format off + pass = pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 2e-4, 2e-4); + pass = pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 2e-4, 2e-4); + pass = pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:"); + // clang-format on + }; + + return (pass); +}; + +static const double epsilon = std::numeric_limits::epsilon(); + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + BatchNormBwdArg arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + else if(arg.data_type == 1) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + else if(arg.data_type == 5) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + else if(arg.data_type == 6) + { + if(arg.use_multiblock_welford) + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + else + pass = bnorm_bwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.haveSavedMeanInvVar, + epsilon); + } + } + else + { + pass = bnorm_bwd_nhwc_test(true, + 3, + false, // don't time kernel + {128, 16, 6, 512}, + false, + epsilon); + + pass = pass && bnorm_bwd_nhwc_test(true, + 3, + false, // don't time kernel + {128, 16, 3, 1024}, + false, + epsilon); + }; + + return (pass ? 0 : 1); +} diff --git a/example/34_batchnorm/batchnorm_common.hpp b/example/34_batchnorm/batchnorm_common.hpp new file mode 100644 index 00000000..bdbc8ea8 --- /dev/null +++ b/example/34_batchnorm/batchnorm_common.hpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
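+
+// Shared helpers for the batchnorm examples:
+//   * NormalizeInInfer - element-wise functor used by the inference path; it
+//     computes y = ((x - mean) / sqrt(variance + epsilon)) * gamma + beta,
+//     doing the arithmetic in the (wider) mean/variance type before converting
+//     back to the output type.
+//   * get_invariant_dims - flags each reduce dimension in a bitmask and
+//     returns the remaining (invariant) dimensions in ascending order. For the
+//     NHWC examples, Rank = 4 and reduceDims = {0, 1, 2} (N, H, W), so the
+//     invariant dimension set is {3}, i.e. the channel dimension whose length
+//     equals scaleBiasMeanVarLengths[0].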
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/utility/data_type.hpp" + +struct NormalizeInInfer +{ + NormalizeInInfer(double epsilon = 1e-4) : epsilon_(epsilon) {} + + template + __host__ __device__ constexpr void operator()(T1& y, + const T1& x, + const T2& mean, + const T2& variance, + const T3& gamma, + const T4& beta) const + { + static_assert(std::is_same::value || std::is_same::value, + "Data type is not supported by this operation!"); + + using ck::type_convert; + using ck::math::sqrt; + + T2 tmp_x, tmp_y; + + tmp_x = type_convert(x); + + tmp_y = ((tmp_x - mean) / sqrt(variance + type_convert(epsilon_))) * + type_convert(gamma) + + type_convert(beta); + y = type_convert(tmp_y); + }; + + double epsilon_; +}; + +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) +{ + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::array invariantDims; + + // collect invariant dimensions + int dim = 0; + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims[dim] = i; + dim++; + }; + + return invariantDims; +}; diff --git a/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp new file mode 100644 index 00000000..dc298485 --- /dev/null +++ b/example/34_batchnorm/batchnorm_forward_inferring_nhwc.cpp @@ -0,0 +1,366 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp" + +#include "batchnorm_infer_impl.hpp" + +static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchNormInferArg +{ + private: + int option_index = 0; + + public: + std::vector inOutLengths; + + bool do_verification = false; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension " + "lengths, must have 4 integers for nhwc" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization " + "result by " + "comparing with the host-based batch-normalization" + << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2: init method used for bnScale and bnBias (0=no init, 1=single integer " + "value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg3: time kernel (0=no, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:", long_options, &option_index); + if(ch == -1) + 
break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inOutLengths = getTypeValuesFromString(optarg); + + if(inOutLengths.size() != 4) + throw std::runtime_error( + "NHWC tensor layout should have 4 length values specified!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 3 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6) + return (-1); + + return (0); + }; +}; + +using namespace ck; + +template +bool bnorm_infer_nhwc_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector inOutLengths, + double epsilon) +{ + // for NHWC BatchNorm calculation of mean and meansquare + constexpr int Rank = 4; + constexpr int NumReduceDim = 3; + + // when using lengths[] to create a tensor, lengths[0] is the length of highest dimension + // eg. N of NHWC, so lengths[3] is the dimension C length of NHWC + const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; + + // input data of the batchnorm forward algorithm + Tensor x(inOutLengths); + Tensor bnScale(scaleBiasMeanVarLengths); + Tensor bnBias(scaleBiasMeanVarLengths); + + // output data of the batchnorm forward algorithm + Tensor y_ref(inOutLengths); + Tensor y(inOutLengths); + + Tensor estimatedMean(scaleBiasMeanVarLengths); + Tensor estimatedVariance(scaleBiasMeanVarLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if constexpr(std::is_same::value) + { + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + + const float x_mean = 0.0f; + const float x_stddev = 2.5f; + const float noise_stddev = 0.0001f; + + estimatedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + estimatedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.0001f; + + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the savedMean to be values with tiny variation to the mean of the x values + estimatedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + // initialize the variance to be values with tiny variation to the variance of the x values + estimatedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); + break; + case 2: + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 
5}, num_thread); + break; + default: + bnScale.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(InOutDataType) * y.mDesc.GetElementSpaceSize()); + DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize()); + DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize()); + + // mean_dev or resultSaveMean_dev + DeviceMem estimatedMean_dev(sizeof(AccDataType) * estimatedMean.mDesc.GetElementSpaceSize()); + // meansquare_dev or resultSaveInvVariance_dev + DeviceMem estimatedVariance_dev(sizeof(AccDataType) * + estimatedVariance.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + bnBias_dev.ToDevice(bnBias.mData.data()); + estimatedMean_dev.ToDevice(estimatedMean.mData.data()); + estimatedVariance_dev.ToDevice(estimatedVariance.mData.data()); + + using ck::index_t; + + std::array i_inOutLengths; + std::array i_inOutStrides; + std::array i_scaleBiasMeanVarLengths; + std::array i_scaleBiasMeanVarStrides; + + ck::ranges::copy(inOutLengths, i_inOutLengths.begin()); + ck::ranges::copy(inOutStrides, i_inOutStrides.begin()); + ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin()); + ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin()); + + int result = 0; + + result = bnorm_infer(time_kernel, + {0, 1, 2}, + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + bnBias_dev.GetDeviceBuffer(), + epsilon, + estimatedMean_dev.GetDeviceBuffer(), + estimatedVariance_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer()); + + if(result < 0) + return (false); + + bool pass = true; + + if(do_verification) + { + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + using ReferenceBatchNormInferInstance = + ck::tensor_operation::host::ReferenceBatchNormInfer; + auto batchNormInfer_ref = ReferenceBatchNormInferInstance{}; + + auto argument_ptr_ref = + batchNormInfer_ref.MakeArgumentPointer(i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x.mData.data(), + bnScale.mData.data(), + bnBias.mData.data(), + epsilon, + PassThroughOp{}, + estimatedMean.mData.data(), + estimatedVariance.mData.data(), + y_ref.mData.data()); + + if(!batchNormInfer_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout + << "The runtime parameters seems not supported by the BatchNorm instance, exiting!" 
+ << std::endl; + return (-2); + }; + + auto invoker_ptr_ref = batchNormInfer_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + + y_dev.FromDevice(y.mData.data()); + pass = pass && ck::utils::check_err(y, y_ref); + }; + + return (pass); +}; + +static const double epsilon = std::numeric_limits::epsilon(); + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + BatchNormInferArg arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 1) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 3) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 5) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + } + else if(arg.data_type == 6) + { + pass = bnorm_infer_nhwc_test( + arg.do_verification, arg.init_method, arg.time_kernel, arg.inOutLengths, epsilon); + }; + } + else + { + pass = bnorm_infer_nhwc_test(true, + 2, + false, // don't time kernel + {128, 16, 16, 1024}, + epsilon); + }; + + return (pass ? 0 : 1); +} diff --git a/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp new file mode 100644 index 00000000..da36d65a --- /dev/null +++ b/example/34_batchnorm/batchnorm_forward_training_nhwc.cpp @@ -0,0 +1,591 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
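+
+// Forward-training batchnorm example for NHWC tensors: the batch mean and
+// variance are computed over dimensions {0, 1, 2} (N, H, W), the input is
+// normalized and scaled/shifted by bnScale/bnBias, and, when requested, the
+// saved mean / inverse variance and the running (moving) statistics are also
+// written out (blended here with averageFactor = 0.1, in the usual
+// exponential-moving-average fashion). The bandwidth reported by the timed run
+// is derived from the bytes the kernel is expected to touch:
+//
+//   bytes = N*H*W*C * sizeof(InOutDataType) * 2            // read x, write y
+//         + C * sizeof(AccDataType) * 2                    // read scale, bias
+//         + (saveMeanAndInvVariance ? 2 : 0) * C * sizeof(AccDataType)
+//         + (updateMovingAverage   ? 4 : 0) * C * sizeof(AccDataType)
+//   GB/s  = bytes / 1.0e6 / elapsed_ms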
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +static struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchNormFwdArg +{ + private: + int option_index = 0; + + public: + std::vector inOutLengths; + + bool do_verification = false; + + bool updateMovingAverage; + bool saveMeanAndInvVariance; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + bool use_multiblock_welford = false; + + public: + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension " + "lengths, must have 4 integers for nhwc" + << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the batch-normalization " + "result by " + "comparing with the host-based batch-normalization" + << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 3: int8, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance " + "(0=no, 1=yes)" + << std::endl; + std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance " + "(0=no, 1=yes)" + << std::endl; + std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer " + "value, 2=scope integer " + "value, 3=decimal value)" + << std::endl; + std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl; + std::cout << "Arg6: use multi-block welford (0=n0, 1=yes)" << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + while(1) + { + ch = getopt_long(argc, argv, "D:v:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inOutLengths = getTypeValuesFromString(optarg); + + if(inOutLengths.size() != 4) + throw std::runtime_error( + "NHWC tensor layout should have 4 length values specified!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + default: show_usage(argv[0]); return (-1); + }; + }; + + if(optind + 6 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + updateMovingAverage = std::atoi(argv[optind++]); + saveMeanAndInvVariance = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind++])); + use_multiblock_welford = static_cast(std::atoi(argv[optind])); + + if(data_type != 0 && data_type 
!= 1 && data_type != 3 && data_type != 5 && data_type != 6) + return (-1); + + return (0); + }; +}; + +using namespace ck; + +template +bool bnorm_fwd_nhwc_test(bool do_verification, + int init_method, + bool time_kernel, + const std::vector inOutLengths, + bool updateMovingAverage, + bool saveMeanAndInvVariance, + double averageFactor, + double epsilon) +{ + // for NHWC BatchNorm calculation of mean and meansquare + constexpr int Rank = 4; + constexpr int NumReduceDim = 3; + + // when using lengths[] to create a tensor, lengths[0] is the length of highest dimension + // eg. N of NHWC, so lengths[3] is the dimension C length of NHWC + const std::vector scaleBiasMeanVarLengths = {inOutLengths[3]}; + + // input data of the batchnorm forward algorithm + Tensor x(inOutLengths); + Tensor bnScale(scaleBiasMeanVarLengths); + Tensor bnBias(scaleBiasMeanVarLengths); + + // output data of the batchnorm forward algorithm + Tensor y_ref(inOutLengths); + Tensor y(inOutLengths); + + Tensor resultSaveMean_ref(scaleBiasMeanVarLengths); + Tensor resultSaveInvVariance_ref(scaleBiasMeanVarLengths); + + Tensor resultRunningMean_ref(scaleBiasMeanVarLengths); + Tensor resultRunningVariance_ref(scaleBiasMeanVarLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(updateMovingAverage) + { + if constexpr(std::is_same::value) + { + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + + const float x_mean = 0.0f; + const float x_stddev = 2.5f; + const float noise_stddev = 0.04f; + + resultRunningMean_ref.GenerateTensorValue( + GeneratorTensor_4{x_mean, noise_stddev}, num_thread); + + resultRunningVariance_ref.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.04f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the runningMean to be values with tiny variation to the mean of the x + // values + resultRunningMean_ref.GenerateTensorValue( + GeneratorTensor_4{x_mean, noise_stddev}, num_thread); + + // initialize the runningVariance to be values with tiny variation to the variance of + // the x values + resultRunningVariance_ref.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + }; + } + else + { + if constexpr(std::is_same::value) + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + else + x.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_1{0}, num_thread); + break; + case 2: + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + bnScale.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_3{-5.0f, 5.0f}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem x_dev(sizeof(InOutDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(InOutDataType) * 
y.mDesc.GetElementSpaceSize()); + DeviceMem bnScale_dev(sizeof(AccDataType) * bnScale.mDesc.GetElementSpaceSize()); + DeviceMem bnBias_dev(sizeof(AccDataType) * bnBias.mDesc.GetElementSpaceSize()); + + // mean_dev or resultSaveMean_dev + DeviceMem resultSaveMean_dev(sizeof(AccDataType) * + resultSaveMean_ref.mDesc.GetElementSpaceSize()); + // meansquare_dev or resultSaveInvVariance_dev + DeviceMem resultSaveInvVariance_dev(sizeof(AccDataType) * + resultSaveInvVariance_ref.mDesc.GetElementSpaceSize()); + // resultRunningMean_dev + DeviceMem resultRunningMean_dev(sizeof(AccDataType) * + resultRunningMean_ref.mDesc.GetElementSpaceSize()); + // resultRunningVariance_dev + DeviceMem resultRunningVariance_dev(sizeof(AccDataType) * + resultRunningVariance_ref.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + bnBias_dev.ToDevice(bnBias.mData.data()); + + if(updateMovingAverage) + { + resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data()); + resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data()); + }; + + std::array i_inOutLengths; + std::array i_inOutStrides; + std::array i_scaleBiasMeanVarLengths; + std::array i_scaleBiasMeanVarStrides; + + ck::ranges::copy(inOutLengths, i_inOutLengths.begin()); + ck::ranges::copy(inOutStrides, i_inOutStrides.begin()); + ck::ranges::copy(scaleBiasMeanVarLengths, i_scaleBiasMeanVarLengths.begin()); + ck::ranges::copy(scaleBiasMeanVarStrides, i_scaleBiasMeanVarStrides.begin()); + + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + using DeviceBatchNormFwdInstance = + ck::tensor_operation::device::DeviceBatchNormFwdImpl; + + auto batchnorm_fwd = DeviceBatchNormFwdInstance{}; + + auto argument_ptr = batchnorm_fwd.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[] + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + bnBias_dev.GetDeviceBuffer(), + epsilon, + PassThroughOp{}, + y_dev.GetDeviceBuffer(), + saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr, + saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr, + averageFactor, + updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr, + updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr); + + if(!batchnorm_fwd.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters seems not supported by the BatchNorm device instance, " + "exiting!" 
+ << std::endl; + return (false); + }; + + size_t workspace_sz = batchnorm_fwd.GetWorkSpaceSize(argument_ptr.get()); + + DeviceMem workspace_dev(workspace_sz); + + batchnorm_fwd.SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = batchnorm_fwd.MakeInvokerPointer(); + + if(time_kernel) + { + float avg_time = 0.0f; + size_t num_bytes = 0; + + size_t total_length = inOutLengths[0] * inOutLengths[1] * inOutLengths[2] * inOutLengths[3]; + size_t invariant_length = inOutLengths[3]; + + avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + // inputing of x, scale, bias, outputing of y + num_bytes += + total_length * sizeof(InOutDataType) * 2 + invariant_length * sizeof(AccDataType) * 2; + + // outputing of mean, inv-variance + num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(AccDataType) * 2 : 0; + + // updating of moving mean, variance + num_bytes += updateMovingAverage ? invariant_length * sizeof(AccDataType) * 4 : 0; + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + } + else + (void)invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + bool pass = true; + + if(do_verification) + { + + using ReferenceBatchNormFwdInstance = + ck::tensor_operation::host::ReferenceBatchNormFwd; + + auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{}; + + auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer( + i_inOutLengths, + i_inOutStrides, + i_inOutStrides, + {0, 1, 2}, // indicates physical indices of reduce dimensions in lengths[] and strides[] + i_scaleBiasMeanVarLengths, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + i_scaleBiasMeanVarStrides, + x.mData.data(), + bnScale.mData.data(), + bnBias.mData.data(), + epsilon, + PassThroughOp{}, + y_ref.mData.data(), + saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr, + saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr, + averageFactor, + updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr, + updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr); + + if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters seems not supported by the BatchNorm reference " + "instance, exiting!" 
+ << std::endl; + return (false); + }; + + auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + + y_dev.FromDevice(y.mData.data()); + pass = pass && ck::utils::check_err(y, y_ref); + + if(updateMovingAverage) + { + Tensor resultRunningMean(scaleBiasMeanVarLengths); + Tensor resultRunningVariance(scaleBiasMeanVarLengths); + + resultRunningMean_dev.FromDevice(resultRunningMean.mData.data()); + resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data()); + + pass = pass && ck::utils::check_err(resultRunningMean, resultRunningMean_ref); + pass = pass && ck::utils::check_err(resultRunningVariance, resultRunningVariance_ref); + }; + + if(saveMeanAndInvVariance) + { + using ck::host_common::dumpBufferToFile; + + Tensor resultSaveMean(scaleBiasMeanVarLengths); + Tensor resultSaveInvVariance(scaleBiasMeanVarLengths); + + resultSaveMean_dev.FromDevice(resultSaveMean.mData.data()); + resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data()); + + pass = pass && ck::utils::check_err(resultSaveMean, resultSaveMean_ref); + pass = pass && ck::utils::check_err(resultSaveInvVariance, resultSaveInvVariance_ref); + }; + }; + + return (pass); +}; + +const double epsilon = std::numeric_limits::epsilon(); +static const double averageFactor = 0.1; + +int main(int argc, char* argv[]) +{ + bool pass = true; + + if(argc > 1) + { + BatchNormFwdArg arg; + + if(arg.processArgs(argc, argv) < 0) + return (-1); + + if(arg.data_type == 0) + { + if(arg.use_multiblock_welford) + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + else + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 1) + { + if(arg.use_multiblock_welford) + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + else + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 3) + { + if(arg.use_multiblock_welford) + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + else + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 5) + { + if(arg.use_multiblock_welford) + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + else + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + else if(arg.data_type == 6) + { + if(arg.use_multiblock_welford) + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + 
arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + else + pass = bnorm_fwd_nhwc_test(arg.do_verification, + arg.init_method, + arg.time_kernel, + arg.inOutLengths, + arg.updateMovingAverage, + arg.saveMeanAndInvVariance, + averageFactor, + epsilon); + } + } + else + { + pass = bnorm_fwd_nhwc_test(true, + 2, + false, // don't time kernel + {128, 16, 6, 512}, + true, + true, + averageFactor, + epsilon); + + pass = pass && bnorm_fwd_nhwc_test(true, + 2, + false, // don't time kernel + {128, 16, 3, 1024}, + true, + true, + averageFactor, + epsilon); + }; + + return (pass ? 0 : 1); +} diff --git a/example/34_batchnorm/batchnorm_infer_impl.hpp b/example/34_batchnorm/batchnorm_infer_impl.hpp new file mode 100644 index 00000000..e457df81 --- /dev/null +++ b/example/34_batchnorm/batchnorm_infer_impl.hpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" + +#include "batchnorm_common.hpp" + +template +int bnorm_infer( + bool time_kernel, + const std::array reduceDims, + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_scale, + const void* p_bias, + double epsilon, + const void* p_estimatedMean, + const void* p_estimatedVariance, + void* p_y) +{ + (void)bnScaleBiasMeanVarLengths; + + static_assert(NumBatchNormReduceDim < Rank, + "Invalid number of reduced dimensions for batchnorm!"); + + using DeviceNormalizeInstance = ck::tensor_operation::device::DeviceElementwise< + ck::Tuple, // x, mean, + // variance, + // scale, + // bias, + ck::Tuple, // y + NormalizeInInfer, + Rank, + 2, // MPerthread + ck::Sequence<1, 1, 1, 1, 1>, // x, mean, variance, scale, bias + ck::Sequence<1>>; // scalarPerVector: y + + auto invariantDims = get_invariant_dims(reduceDims); + std::array aligned_bnScaleStrides{0}; + std::array aligned_bnBiasStrides{0}; + std::array aligned_bnMeanVarStrides{0}; + + int i = 0; + for(auto dim : invariantDims) + { + assert(xyLengths[dim] == bnScaleBiasMeanVarLengths[i]); + + aligned_bnScaleStrides[dim] = bnScaleStrides[i]; + aligned_bnBiasStrides[dim] = bnBiasStrides[i]; + aligned_bnMeanVarStrides[dim] = bnMeanVarStrides[i]; + i++; + }; + + int32_t reduceLength = 1; + + for(auto dim : reduceDims) + reduceLength *= xyLengths[dim]; + + int32_t invariantLength = 1; + + for(auto dim : invariantDims) + invariantLength *= xyLengths[dim]; + + size_t total_length = static_cast(invariantLength) * reduceLength; + + float avg_time = 0.0f; + std::size_t num_bytes = 0; + + auto dev_normalize = DeviceNormalizeInstance{}; + + auto argument_ptr1 = dev_normalize.MakeArgumentPointer( + xyLengths, + {xStrides, + aligned_bnMeanVarStrides, + aligned_bnMeanVarStrides, + aligned_bnScaleStrides, + aligned_bnBiasStrides}, + {yStrides}, + {p_x, p_estimatedMean, p_estimatedVariance, p_scale, p_bias}, + {p_y}, + NormalizeInInfer{epsilon}); + + if(!dev_normalize.IsSupportedArgument(argument_ptr1.get())) + { + std::cout << "The runtime parameters seems not supported by the Devic, exiting!" 
+ << std::endl; + + return (-1); + }; + + auto invoker_ptr1 = dev_normalize.MakeInvokerPointer(); + + avg_time += invoker_ptr1->Run(argument_ptr1.get(), StreamConfig{nullptr, time_kernel}); + + num_bytes += total_length * sizeof(XDataType) + + invariantLength * + (sizeof(ScaleDataType) + sizeof(BiasDataType) + 2 * sizeof(MeanVarDataType)) + + total_length * sizeof(YDataType); + + if(time_kernel) + { + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + }; + + return (0); +}; diff --git a/example/35_splitK_gemm/CMakeLists.txt b/example/35_splitK_gemm/CMakeLists.txt new file mode 100644 index 00000000..79458395 --- /dev/null +++ b/example/35_splitK_gemm/CMakeLists.txt @@ -0,0 +1,17 @@ +add_custom_target(example_splitK_gemm_xdl) + +add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp) +add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp) +add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp) +add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp) + +add_dependencies(example_splitK_gemm_xdl + example_splitK_gemm_xdl_fp32 + example_splitK_gemm_xdl_fp16 + example_splitK_gemm_xdl_bfp16 + example_splitK_gemm_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp) + add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4) +endif() diff --git a/example/35_splitK_gemm/run_splitK_gemm_example.inc b/example/35_splitK_gemm/run_splitK_gemm_example.inc new file mode 100644 index 00000000..e9bd5c55 --- /dev/null +++ b/example/35_splitK_gemm/run_splitK_gemm_example.inc @@ -0,0 +1,217 @@ +#pragma once + +struct ProblemSize final +{ + ck::index_t M = 3840; + ck::index_t N = 4096; + ck::index_t K = 4096; + + ck::index_t stride_A = K; + ck::index_t stride_B = K; + ck::index_t stride_C = N; + + ck::index_t k_batch = 4; +}; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +bool run_splitK_gemm(const ProblemSize& problem_size, const ExecutionConfig& config) +{ + using namespace ck::literals; + +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) + static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); + static_assert(sizeof(ADataType) == sizeof(KernelADataType)); + static_assert(sizeof(BDataType) == sizeof(KernelBDataType)); +#endif + + auto& [M, N, K, StrideA, StrideB, StrideC, KBatch] = problem_size; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + 
b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_Sequential<0>{}); + b_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + } + + DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor a_m_k_converted(a_m_k); + const Tensor b_k_n_converted(b_k_n); + + a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data()); +#else + a_m_k_device_buf.ToDevice(a_m_k.mData.data()); + b_k_n_device_buf.ToDevice(b_k_n.mData.data()); +#endif + c_m_n_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument( +#ifdef BUILD_INT4_EXAMPLE + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), +#else + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), +#endif + static_cast(c_m_n_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + KBatch); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + invoker.Run(argument, StreamConfig{nullptr, false}); + bool pass = true; + + if(config.do_verification) + { + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + if(std::is_same::value) + { + pass &= ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, "fp16 incorrect result", 3e-3, 1e-3); + } + else + { + pass &= ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + } + } + + if(config.time_kernel) + { + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm.GetTypeString() << std::endl; + } + + return pass; +} + +bool run_splitK_gemm_example(int argc, char* argv[]) +{ + ProblemSize problem_size; + ExecutionConfig config; + + if(argc == 1) + { + // use default case + } + else if(argc == 5) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + problem_size.k_batch = std::stoi(argv[4]); + } + else if(argc == 11) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + 
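+            // Remaining positional arguments: argv[4] selects KBatch (the
+            // split-K factor), argv[5..7] override M, N and K, and argv[8..10]
+            // the A/B/C strides.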
problem_size.k_batch = std::stoi(argv[4]); + + problem_size.M = std::stoi(argv[5]); + problem_size.N = std::stoi(argv[6]); + problem_size.K = std::stoi(argv[7]); + + problem_size.stride_A = std::stoi(argv[8]); + problem_size.stride_B = std::stoi(argv[9]); + problem_size.stride_C = std::stoi(argv[10]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4: KBatch\n"); + printf("arg5 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n"); + exit(0); + } + + return run_splitK_gemm(problem_size, config); +} diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp new file mode 100644 index 00000000..7191ecf5 --- /dev/null +++ b/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = BF16; +using BDataType = BF16; +using AccDataType = F32; +using CDataType = F32; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle + // clang-format off +//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 4>; +// clang-format on + +#include "run_splitK_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); } diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp new file mode 100644 index 00000000..efdb315b --- /dev/null +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp16.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle + // clang-format off +//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 
1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_splitK_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); } diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp new file mode 100644 index 00000000..bc2e3d1d --- /dev/null +++ b/example/35_splitK_gemm/splitK_gemm_xdl_fp32.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CDataType = F32; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle + // clang-format off +//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>; +// clang-format on + +#include "run_splitK_gemm_example.inc" + +int main(int argc, char* argv[]) { 
return !run_splitK_gemm_example(argc, argv); } diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp new file mode 100644 index 00000000..4eb27824 --- /dev/null +++ b/example/35_splitK_gemm/splitK_gemm_xdl_int4.cpp @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = ck::int4_t; +using BDataType = ck::int4_t; +using AccDataType = int32_t; +using CDataType = int32_t; + +using KernelADataType = int8_t; +using KernelBDataType = int8_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle + // clang-format off + , // ABlockTransfer ThreadCluster Lengths_K0_M_K1 + S<0, 2, 1, 3>, // ABlockTransfer ThreadCluster ArrangeOrder + S<0, 2, 1, 3>, // ABlockTransfer SrcAccessOrder + 3, // ABlockTransfer SrcVectorDim + 16, // ABlockTransfer SrcScalarPerVector + 16, // ABlockTransfer DstScalarPerVector_K1 + true, // ABlockLdsExtraM + S<1, 4, 64, 1>, // BBlockTransfer ThreadCluster Lengths_K0_N_K1 + S<0, 1, 3, 2>, // BBlockTransfer ThreadCluster ArrangeOrder + S<0, 1, 3, 2>, // BBlockTransfer SrcAccessOrder + 3, // BBlockTransfer SrcVectorDim + 16, // BBlockTransfer SrcScalarPerVector + 16, // BBlockTransfer DstScalarPerVector_K1 + true, // BBlockLdsExtraN + 1, // CShuffleMXdlPerWavePerShuffle + 1, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CBlockTransferClusterLengths _MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + 4>; // CBlockTransferScalarPerVector_NWaveNPerXdl +// clang-format on + +#define BUILD_INT4_EXAMPLE +#include "run_splitK_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); } diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp new file mode 100644 index 00000000..eefdbca6 --- /dev/null +++ b/example/35_splitK_gemm/splitK_gemm_xdl_int8.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
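+
+// int8 split-K GEMM example. The KBatch value handed to MakeArgument splits
+// the K dimension so that several workgroups each reduce a K/KBatch slice;
+// run_splitK_gemm() zero-initializes the C buffer (SetZero) before the launch,
+// consistent with the partial results being accumulated into the int32 output.
+// A typical invocation of the built example (target name
+// example_splitK_gemm_xdl_int8 per CMakeLists.txt; install path illustrative):
+//
+//   ./bin/example_splitK_gemm_xdl_int8 1 1 0 4 3840 4096 4096 4096 4096 4096
+//
+// i.e. verify, integer init, no timing, KBatch = 4, then M, N, K and the
+// A, B, C strides.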
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/utility/literals.hpp" + +template +using S = ck::Sequence; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using ADataType = int8_t; +using BDataType = int8_t; +using AccDataType = int32_t; +using CDataType = int32_t; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdlSplitKCShuffle + // clang-format off +//######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| +//######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| +//######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| +//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 16, 16, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 16, 16, true, 1, 1, S<1, 32, 1, 8>, 4>; +// clang-format on + +#include "run_splitK_gemm_example.inc" + +int main(int argc, char* argv[]) { return !run_splitK_gemm_example(argc, argv); } diff --git a/example/36_sparse_embedding/CMakeLists.txt b/example/36_sparse_embedding/CMakeLists.txt new file mode 100644 index 00000000..16ca6ea4 --- /dev/null +++ b/example/36_sparse_embedding/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_sparse_embedding3_forward_layernorm sparse_embedding3_forward_layernorm.cpp) diff --git a/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp new file mode 100644 index 00000000..f5eb4c3b --- /dev/null +++ 
b/example/36_sparse_embedding/sparse_embedding3_forward_layernorm.cpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp" + +// using EmbType = float; +// using IndexType = int64_t; +// using GammaDataType = float; +// using BetaDataType = float; +// using AccDataType = float; +// using OutType = float; + +using EmbType = ck::half_t; +using IndexType = int64_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using AccDataType = float; +using OutType = ck::half_t; + +// clang-format off +// BlockSize, DimClusterSize, RowClusterSize, DimPerBlock, RowPerBlock, DimThreadSize, RowVectorSize +using DeviceInstance_fp32_e256 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e512 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e768 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e1024 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e1536 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e2048 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e4096 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e8192 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp32_e16384 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; + +using DeviceInstance_fp16_e256 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e512 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e768 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e1024 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e1536 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e2048 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e4096 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; +using DeviceInstance_fp16_e8192 = ck::tensor_operation::device::DeviceSparseEmbedding3ForwardLayernorm; + +template struct emb_kernel{}; + +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e256; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e512; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e768; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e1024;}; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e1536;}; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e2048;}; +template<> struct 
emb_kernel { using kernel_type = DeviceInstance_fp32_e4096;}; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp32_e8192;}; +template<> struct emb_kernel{ using kernel_type = DeviceInstance_fp32_e16384;}; + +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e256; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e512; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e768; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e1024; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e1536; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e2048; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e4096; }; +template<> struct emb_kernel { using kernel_type = DeviceInstance_fp16_e8192; }; + +// clang-format on + +int main() +{ + bool time_kernel = true; + + constexpr auto num_rows = 65536; + constexpr auto dims = ck::Sequence<256, 512, 768, 1024, 1536, 2048, 4096, 8192>{}; + // constexpr auto dims = ck::Sequence<256, 512>{}; + constexpr auto index_length = 2048; + constexpr AccDataType epsilon = 1e-4; + + auto f_host_tensor_desc_1d = [](std::size_t len_) { return HostTensorDescriptor({len_}); }; + + auto f_host_tensor_desc_2d = [](std::size_t rows_, std::size_t cols_) { + return HostTensorDescriptor({rows_, cols_}); + }; + + using ReferenceInstance = + ck::tensor_operation::host::ReferenceSparseEmbedding3ForwardLayernorm; + + ck::static_for<0, dims.Size(), 1>{}([&](auto I) { + std::srand(std::time(nullptr)); + constexpr auto current_dim = dims.At(I); + Tensor emb_a(f_host_tensor_desc_2d(num_rows, current_dim)); + Tensor emb_b(f_host_tensor_desc_2d(num_rows, current_dim)); + Tensor emb_c(f_host_tensor_desc_2d(num_rows, current_dim)); + + Tensor index_a(f_host_tensor_desc_1d(index_length)); + Tensor index_b(f_host_tensor_desc_1d(index_length)); + Tensor index_c(f_host_tensor_desc_1d(index_length)); + + Tensor gamma(f_host_tensor_desc_1d(current_dim)); + Tensor beta(f_host_tensor_desc_1d(current_dim)); + + Tensor out(f_host_tensor_desc_2d(index_length, current_dim)); + + emb_a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + emb_b.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + emb_c.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + index_a.GenerateTensorValue(GeneratorTensor_2{0, num_rows}); + index_b.GenerateTensorValue(GeneratorTensor_2{0, num_rows}); + index_c.GenerateTensorValue(GeneratorTensor_2{0, num_rows}); + + gamma.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + beta.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem emb_a_dev(sizeof(EmbType) * emb_a.mDesc.GetElementSpaceSize()); + DeviceMem emb_b_dev(sizeof(EmbType) * emb_b.mDesc.GetElementSpaceSize()); + DeviceMem emb_c_dev(sizeof(EmbType) * emb_c.mDesc.GetElementSpaceSize()); + + DeviceMem index_a_dev(sizeof(IndexType) * index_a.mDesc.GetElementSpaceSize()); + DeviceMem index_b_dev(sizeof(IndexType) * index_b.mDesc.GetElementSpaceSize()); + DeviceMem index_c_dev(sizeof(IndexType) * index_c.mDesc.GetElementSpaceSize()); + + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + + DeviceMem out_dev(sizeof(OutType) * out.mDesc.GetElementSpaceSize()); + + emb_a_dev.ToDevice(emb_a.mData.data()); + emb_b_dev.ToDevice(emb_b.mData.data()); + emb_c_dev.ToDevice(emb_c.mData.data()); + + index_a_dev.ToDevice(index_a.mData.data()); 
+ index_b_dev.ToDevice(index_b.mData.data()); + index_c_dev.ToDevice(index_c.mData.data()); + + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + auto device_instance = typename emb_kernel::kernel_type{}; + auto argument_ptr = device_instance.MakeArgumentPointer(out_dev.GetDeviceBuffer(), + emb_a_dev.GetDeviceBuffer(), + emb_b_dev.GetDeviceBuffer(), + emb_c_dev.GetDeviceBuffer(), + index_a_dev.GetDeviceBuffer(), + index_b_dev.GetDeviceBuffer(), + index_c_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + num_rows, + current_dim, + index_length, + epsilon); + std::cout << "Dim:" << current_dim << ", kernel:" << device_instance.GetTypeString() + << std::endl + << std::flush; + + bool is_supported = device_instance.IsSupportedArgument(argument_ptr.get()); + + if(!is_supported) + { + std::cout << "Runtime parameters are not supported" << std::endl; + return; + } + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float time_ms = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + bool pass = true; + { + Tensor out_from_dev(f_host_tensor_desc_2d(index_length, current_dim)); + ReferenceInstance ref; + auto ref_argument = ref.MakeArgument(out, + emb_a, + emb_b, + emb_c, + index_a, + index_b, + index_c, + gamma, + beta, + num_rows, + current_dim, + index_length, + epsilon); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + out_dev.FromDevice(out_from_dev.mData.data()); + pass &= ck::utils::check_err(out_from_dev, out, "Error: Incorrect results", 1e-3, 1e-3); + } + + double total_read = current_dim * index_length * 3 * sizeof(EmbType) + + current_dim * sizeof(GammaDataType) + + current_dim * sizeof(BetaDataType); + double total_write = current_dim * index_length * sizeof(OutType); + double gbps = (total_read + total_write) / time_ms / 1e6; + + std::cout << ", total bytes:" << (total_read + total_write) << ", time:" << time_ms + << ", gbps:" << gbps << ", valid:" << (pass ? "y" : "n") << std::endl + << std::flush; + }); + + return 0; +} diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt b/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt new file mode 100644 index 00000000..a9be3a71 --- /dev/null +++ b/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp) diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp new file mode 100644 index 00000000..071e8a74 --- /dev/null +++ b/example/37_batched_gemm_add_add_relu_gemm_add/batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
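+
+// Scalar restatement of the fused computation described in the comment below
+// (a hypothetical helper; it is not called by the example and only documents the math):
+//   E0[g,m,n] = Relu(sum_k A0[g,m,k] * B0[g,n,k] + D00[g,m,n] + D01[g,m,n])   via CDE0ElementOp
+//   E1[g,m,o] = sum_n E0[g,m,n] * B1[g,n,o] + D1[g,m,o]                        via CDE1ElementOp (Add)
+// assuming, as in this file, row-major A0/D00/D01/B1/D1/E1 and column-major B0 within one batch.
+template <typename T, typename Acc>
+Acc fused_gemm_relu_gemm_element_sketch(const T* a0,  // A0, row-major M x K
+                                        const T* b0,  // B0, column-major K x N
+                                        const T* d00, // D00, row-major M x N
+                                        const T* d01, // D01, row-major M x N
+                                        const T* b1,  // B1, row-major N x O
+                                        const T* d1,  // D1, row-major M x O
+                                        int m,
+                                        int o,
+                                        int N,
+                                        int K,
+                                        int O)
+{
+    Acc e1 = static_cast<Acc>(d1[m * O + o]); // final bias add (CDE1ElementOp)
+    for(int n = 0; n < N; ++n)
+    {
+        Acc c0{0};
+        for(int k = 0; k < K; ++k) // Gemm0 inner product over K
+            c0 += static_cast<Acc>(a0[m * K + k]) * static_cast<Acc>(b0[n * K + k]);
+
+        Acc e0 = c0 + static_cast<Acc>(d00[m * N + n]) + static_cast<Acc>(d01[m * N + n]);
+        e0     = e0 > Acc{0} ? e0 : Acc{0}; // AddAddRelu epilogue of Gemm0
+
+        e1 += e0 * static_cast<Acc>(b1[n * O + o]); // Gemm1 accumulation over N
+    }
+    return e1;
+}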
+ +/* +Computes C_m_o = Relu(A0[m, k] * B0[n, k] + D00[m, n] + D01[mn]) * B1[n, o] + D1[m, o] +*/ + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +using A0DataType = F16; +using B0DataType = F16; +using Acc0DataType = F32; +using D00DataType = F16; +using D01DataType = F16; +using B1DataType = F16; +using Acc1DataType = F32; +using C1ShuffleDataType = F32; +using D1DataType = F16; +using E1DataType = F16; + +using A0Layout = Row; +using B0Layout = Col; +using D00Layout = Row; +using D01Layout = Row; +using B1Layout = Row; +using D1Layout = Row; +using E1Layout = Row; + +// E = Relu(C + D0 + D1) +struct AddAddRelu +{ + __host__ __device__ void + operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d0, const ck::half_t& d1) const + { + const ck::half_t x = c + d0 + d1; + + ck::tensor_operation::element_wise::Relu{}.template operator()(e, x); + } + __host__ __device__ void + operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const + { + const float x = c + (d0 + d1); + + ck::tensor_operation::element_wise::Relu{}.template operator()(e, x); + } +}; + +// E = Gelu(C + D0 + D1) +struct AddAddGelu +{ + __host__ __device__ void + operator()(ck::half_t& e, const ck::half_t& c, const ck::half_t& d0, const ck::half_t& d1) const + { + const ck::half_t x = c + d0 + d1; + + ck::tensor_operation::element_wise::Gelu{}.template operator()(e, + x); + } + + __host__ __device__ void + operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const + { + const float x = c + (d0 + d1); + + ck::tensor_operation::element_wise::Gelu{}.template operator()(e, x); + } +}; + +// E = FastGelu(C + D0 + D1) +struct AddAddFastGelu +{ + __host__ __device__ void + operator()(float& e, const float& c, const ck::half_t& d0, const ck::half_t& d1) const + { + const float x = c + (d0 + d1); + + ck::tensor_operation::element_wise::FastGelu{}.template operator()(e, x); + } +}; + +using A0ElementOp = PassThrough; +using B0ElementOp = PassThrough; +using CDE0ElementOp = AddAddRelu; +using A1ElementOp = PassThrough; +using B1ElementOp = PassThrough; +using CDE1ElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr bool PadGemm0M = false; +static constexpr bool PadGemm0N = false; +static constexpr bool PadGemm0K = false; +static constexpr bool PadGemm1N = false; +static constexpr bool PadGemm1K = false; + +using DeviceGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< + A0Layout, + B0Layout, + ck::Tuple, + B1Layout, + ck::Tuple, + E1Layout, + A0DataType, + B0DataType, + Acc0DataType, + ck::Tuple, + B1DataType, + Acc1DataType, + C1ShuffleDataType, + ck::Tuple, + 
E1DataType, + A0ElementOp, + B0ElementOp, + CDE0ElementOp, + B1ElementOp, + CDE1ElementOp, + PadGemm0M, + PadGemm0N, + PadGemm0K, + PadGemm1N, + PadGemm1K, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 64; + ck::index_t O = 128; + ck::index_t BatchCount = 4; + ck::index_t StrideA0 = -1; + ck::index_t StrideB0 = -1; + ck::index_t StrideD00 = -1; + ck::index_t StrideD01 = -1; + ck::index_t StrideB1 = -1; + ck::index_t StrideD1 = -1; + ck::index_t StrideE1 = -1; + ck::index_t BatchStrideA0 = -1; + ck::index_t BatchStrideB0 = -1; + ck::index_t BatchStrideD00 = -1; + ck::index_t BatchStrideD01 = -1; + ck::index_t BatchStrideB1 = -1; + ck::index_t BatchStrideD1 = -1; + ck::index_t BatchStrideE1 = -1; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else if(argc == 9) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + + BatchCount = std::stoi(argv[8]); + } + else if(argc == 23) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + + M = std::stoi(argv[4]); + N = std::stoi(argv[5]); + K = std::stoi(argv[6]); + O = std::stoi(argv[7]); + + BatchCount = std::stoi(argv[8]); + + StrideA0 = std::stoi(argv[9]); + StrideB0 = std::stoi(argv[10]); + StrideD00 = std::stoi(argv[11]); + StrideD01 = std::stoi(argv[12]); + StrideB1 = std::stoi(argv[13]); + StrideD1 = std::stoi(argv[14]); + StrideE1 = std::stoi(argv[15]); + + BatchStrideA0 = std::stoi(argv[16]); + BatchStrideB0 = std::stoi(argv[17]); + BatchStrideD00 = std::stoi(argv[18]); + BatchStrideD01 = std::stoi(argv[19]); + BatchStrideB1 = std::stoi(argv[20]); + BatchStrideD1 = std::stoi(argv[21]); + BatchStrideE1 = std::stoi(argv[22]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + printf("arg4 to 8: M, N, K, O, Batch\n"); + printf( + "arg9 to 15: StrideA0, StrideB0, StrideD00, StrideD01, StrideB1, StrideD1, StrideE1\n"); + printf("arg16 to 22: BatchStrideA0, BatchStrideB0, BatchStrideD00, BatchStrideD01, " + "BatchStrideB1, BatchStrideD1, BatchStrideE1 \n"); + exit(0); + } + + const int DefaultStrideA0 = ck::is_same_v ? K : M; + const int DefaultStrideB0 = ck::is_same_v ? N : K; + const int DefaultStrideD00 = ck::is_same_v ? 
N : M; + const int DefaultStrideD01 = ck::is_same_v ? N : M; + const int DefaultStrideB1 = ck::is_same_v ? O : N; + const int DefaultStrideD1 = ck::is_same_v ? O : M; + const int DefaultStrideE1 = ck::is_same_v ? O : M; + + StrideA0 = (StrideA0 < 0) ? DefaultStrideA0 : StrideA0; + StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; + StrideD00 = (StrideD00 < 0) ? DefaultStrideD00 : StrideD00; + StrideD01 = (StrideD01 < 0) ? DefaultStrideD01 : StrideD01; + StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; + StrideD1 = (StrideD1 < 0) ? DefaultStrideD1 : StrideD1; + StrideE1 = (StrideE1 < 0) ? DefaultStrideE1 : StrideE1; + + const int DefaultBatchStrideA0 = (ck::is_same_v ? K : M) * StrideA0; + const int DefaultBatchStrideB0 = (ck::is_same_v ? N : K) * StrideB0; + const int DefaultBatchStrideD00 = (ck::is_same_v ? N : M) * StrideD00; + const int DefaultBatchStrideD01 = (ck::is_same_v ? N : M) * StrideD01; + const int DefaultBatchStrideB1 = (ck::is_same_v ? O : N) * StrideB1; + const int DefaultBatchStrideD1 = (ck::is_same_v ? O : M) * StrideD1; + const int DefaultBatchStrideE1 = (ck::is_same_v ? O : M) * StrideE1; + + BatchStrideA0 = BatchStrideA0 < 0 ? DefaultBatchStrideA0 : BatchStrideA0; + BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; + BatchStrideD00 = BatchStrideD00 < 0 ? DefaultBatchStrideD00 : BatchStrideD00; + BatchStrideD01 = BatchStrideD01 < 0 ? DefaultBatchStrideD01 : BatchStrideD01; + BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; + BatchStrideD1 = BatchStrideD1 < 0 ? DefaultBatchStrideD1 : BatchStrideD1; + BatchStrideE1 = BatchStrideE1 < 0 ? DefaultBatchStrideE1 : BatchStrideE1; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + // E_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a0_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA0, BatchStrideA0, A0Layout{})); + Tensor b0_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{})); + Tensor d00_g_m_n( + f_host_tensor_descriptor(BatchCount, M, N, StrideD00, BatchStrideD00, D00Layout{})); + Tensor d01_g_m_n( + f_host_tensor_descriptor(BatchCount, M, N, StrideD01, BatchStrideD01, D01Layout{})); + Tensor b1_g_n_o( + f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{})); + Tensor d1_g_m_o( + f_host_tensor_descriptor(BatchCount, M, O, StrideD1, BatchStrideD1, D1Layout{})); + Tensor e1_g_m_o_host_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{})); + Tensor e1_g_m_o_device_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{})); + + std::cout << "a0_g_m_k: " << a0_g_m_k.mDesc << std::endl; + std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl; + std::cout << "d00_g_m_n: " << d00_g_m_n.mDesc + << " size: " << d00_g_m_n.mDesc.GetElementSpaceSize() << std::endl; + std::cout << "d01_g_m_n: " << d01_g_m_n.mDesc + << " size: " << d01_g_m_n.mDesc.GetElementSpaceSize() << std::endl; + std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; + std::cout << "e1_g_m_o: " << e1_g_m_o_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + 
a0_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + d00_g_m_n.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + d01_g_m_n.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + d1_g_m_o.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + break; + case 2: + a0_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d00_g_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d01_g_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d1_g_m_o.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + break; + default: + a0_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + d00_g_m_n.GenerateTensorValue(GeneratorTensor_1{1}); + d01_g_m_n.GenerateTensorValue(GeneratorTensor_1{1}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + d1_g_m_o.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem a0_g_m_k_device_buf(sizeof(A0DataType) * a0_g_m_k.mDesc.GetElementSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize()); + DeviceMem d00_g_m_n_device_buf(sizeof(D00DataType) * d00_g_m_n.mDesc.GetElementSpaceSize()); + DeviceMem d01_g_m_n_device_buf(sizeof(D01DataType) * d01_g_m_n.mDesc.GetElementSpaceSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize()); + DeviceMem e1_g_m_o_device_buf(sizeof(E1DataType) * + e1_g_m_o_device_result.mDesc.GetElementSize()); + DeviceMem d1_g_m_o_device_buf(sizeof(D1DataType) * d1_g_m_o.mDesc.GetElementSpaceSize()); + + a0_g_m_k_device_buf.ToDevice(a0_g_m_k.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); + d00_g_m_n_device_buf.ToDevice(d00_g_m_n.mData.data()); + d01_g_m_n_device_buf.ToDevice(d01_g_m_n.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); + d1_g_m_o_device_buf.ToDevice(d1_g_m_o.mData.data()); + + auto a0_element_op = A0ElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto cde0_element_op = CDE0ElementOp{}; + auto b1_element_op = B1ElementOp{}; + auto cde1_element_op = CDE1ElementOp{}; + + // do GEMM + auto gemm = DeviceGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = + gemm.MakeArgument(static_cast(a0_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + std::array{d00_g_m_n_device_buf.GetDeviceBuffer(), + d01_g_m_n_device_buf.GetDeviceBuffer()}, + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + std::array{d1_g_m_o_device_buf.GetDeviceBuffer()}, + static_cast(e1_g_m_o_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + BatchCount, + StrideA0, + StrideB0, + std::array{StrideD00, StrideD01}, + StrideB1, + std::array{StrideD1}, + StrideE1, + BatchStrideA0, + BatchStrideB0, + std::array{BatchStrideD00, BatchStrideD01}, + BatchStrideB1, + std::array{BatchStrideD1}, + BatchStrideE1, + a0_element_op, + b0_element_op, + cde0_element_op, + b1_element_op, + cde1_element_op); + + if(!gemm.IsSupportedArgument(argument)) + { + std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = + (sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + 
sizeof(D00DataType) * N + + sizeof(D01DataType) * N + sizeof(B1DataType) * N * O + sizeof(E1DataType) * M * O + + sizeof(D1DataType) * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << gemm.GetTypeString() << std::endl; + + e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data()); + + if(do_verification) + { + using ReferenceGemm0Instance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + using ReferenceGemm1Instance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + // Output of Gemm0 is input A of Gemm1 + Tensor c0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + Tensor e0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + Tensor c1_g_m_o(f_host_tensor_descriptor(BatchCount, M, O, O, M * O, Row{})); + + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a0_g_m_k, b0_g_k_n, c0_g_m_n, a0_element_op, b0_element_op, PassThrough{}); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // bias+bias+relu + e0_g_m_n.ForEach([&](auto&, auto idx) { + cde0_element_op(e0_g_m_n(idx), c0_g_m_n(idx), d00_g_m_n(idx), d01_g_m_n(idx)); + }); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + e0_g_m_n, b1_g_n_o, c1_g_m_o, PassThrough{}, b1_element_op, PassThrough{}); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // bias + e1_g_m_o_host_result.ForEach([&](auto&, auto idx) { + cde1_element_op(e1_g_m_o_host_result(idx), c1_g_m_o(idx), d1_g_m_o(idx)); + }); + + return ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result) ? 0 : 1; + } + + return 0; +} diff --git a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt new file mode 100644 index 00000000..9cf960c5 --- /dev/null +++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt @@ -0,0 +1,7 @@ +add_custom_target(example_grouped_conv_bwd_data) + +add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp) +add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp) + +add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16) +add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16) diff --git a/example/38_grouped_conv_bwd_data_multiple_d/common.hpp b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp new file mode 100644 index 00000000..d07ee7bd --- /dev/null +++ b/example/38_grouped_conv_bwd_data_multiple_d/common.hpp @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
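+
+// Command-line summary for the example binaries built from this directory (see
+// print_help_msg() and parse_cmd_args() below for the authoritative behaviour, and
+// ck::utils::conv::get_conv_param_parser_helper_msg() for the per-dimension argument order):
+//   <exe>                        run with DefaultConvParams (a 2-D problem; reading the
+//                                initializer below in ConvParam's argument order this is
+//                                G=32, N=4, K=192, C=192, 3x3 filter, 28x28 input)
+//   <exe> 1 1 1                  set do_verification, init_method, time_kernel only
+//   <exe> 1 1 1 2 <G> <N> <K> <C> ...
+//                                additionally override the convolution sizes, starting with
+//                                num_dim_spatial and the five leading parameters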
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static inline constexpr ck::index_t NDimSpatial = 2; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +using FP16 = ck::half_t; +using FP32 = float; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; +}; + +#define DefaultConvParams \ + ck::utils::conv::ConvParam \ + { \ + NDimSpatial, 32, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, { 1, 1 } \ + } + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool parse_cmd_args(int argc, + char* argv[], + ExecutionConfig& config, + ck::utils::conv::ConvParam& conv_params) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + config = ExecutionConfig{}; + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam arguments + else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + conv_params = ck::utils::conv::parse_conv_param( + num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + print_help_msg(); + return false; + } + + return true; +} diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp new file mode 100644 index 00000000..55ea8c3a --- /dev/null +++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_bias_relu_fp16.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, 
Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using OutDataType = FP16; +using WeiDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using BiasDataType = FP16; // bias +using InDataType = FP16; + +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using BiasLayout = ck::Tuple; +using InLayout = ck::tensor_layout::convolution::GNHWC; + +using OutElementOp = PassThrough; +using WeiElementOp = PassThrough; +using InElementOp = ck::tensor_operation::element_wise::AddRelu; + +// clang-format off +using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 +// ######| NDimSpatial| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| +// ######| | | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < NDimSpatial, OutLayout, WeiLayout, BiasLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType, ck::Tuple, InDataType, OutElementOp, WeiElementOp, InElementOp, ConvBwdDataDefault, true, true, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_grouped_conv_bwd_data_bias_relu_example.inc" + +int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_bias_relu_example(argc, argv); } diff --git a/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp new file mode 100644 index 00000000..ddf82ec5 --- /dev/null +++ b/example/38_grouped_conv_bwd_data_multiple_d/grouped_conv_bwd_data_fp16.cpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
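+
+// This file is the plain backward-data variant of the bias+relu example above: DsDataType and
+// DsLayout are empty ck::Tuple<>s and InElementOp is PassThrough, so the kernel writes
+// in = conv_bwd_data(out, wei) directly. The bias+relu variant instead passes one extra D
+// tensor (a per-(g, c) bias broadcast over N/H/W through zero strides, see bias_g_n_c_wis_desc
+// in run_grouped_conv_bwd_data_bias_relu_example.inc) and applies AddRelu, giving
+// in = Relu(conv_bwd_data(out, wei) + bias).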
+ +#include "common.hpp" + +using OutDataType = FP16; +using WeiDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using DsDataType = ck::Tuple<>; +using InDataType = FP16; + +using OutLayout = ck::tensor_layout::convolution::GNHWK; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using DsLayout = ck::Tuple<>; +using InLayout = ck::tensor_layout::convolution::GNHWC; + +using OutElementOp = PassThrough; +using WeiElementOp = PassThrough; +using InElementOp = PassThrough; + +// clang-format off +using DeviceConvInstance = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 +// ######| NDimSpatial| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| +// ######| | | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| +// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + < NDimSpatial, OutLayout, WeiLayout, DsLayout, InLayout, OutDataType, WeiDataType, AccDataType, CShuffleDataType, DsDataType, InDataType, OutElementOp, WeiElementOp, InElementOp, ConvBwdDataDefault, true, true, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>; +// clang-format on + +#include "run_grouped_conv_bwd_data_example.inc" + +int main(int argc, char* argv[]) { return run_grouped_conv_bwd_data_example(argc, argv); } diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc new file mode 100644 index 00000000..0afd8bd7 --- /dev/null +++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_bias_relu_example.inc @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +bool run_conv_bwd_data_bias_relu(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_params, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& bias_g_n_c_wis_desc, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const OutElementOp& out_element_op, + const WeiElementOp& wei_element_op, + const InElementOp& in_element_op) +{ + Tensor out(out_g_n_k_wos_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(bias_g_n_c_wis_desc); + Tensor in_host(in_g_n_c_wis_desc); + Tensor in_device(in_g_n_c_wis_desc); + + std::cout << "out: " << out.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "in: " << in_host.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + out.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize()); + DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize()); + + out_device_buf.ToDevice(out.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + + // reset input to zero + in_device_buf.SetZero(); + + std::array a_g_n_k_wos_lengths{}; + std::array a_g_n_k_wos_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array d0_g_n_c_wis_lengths{}; + std::array d0_g_n_c_wis_strides{}; + std::array e_g_n_c_wis_lengths{}; + std::array e_g_n_c_wis_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(bias_g_n_c_wis_desc.GetLengths(), d0_g_n_c_wis_lengths); + copy(bias_g_n_c_wis_desc.GetStrides(), d0_g_n_c_wis_strides); + copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides); + copy(conv_params.conv_filter_strides_, conv_filter_strides); + copy(conv_params.conv_filter_dilations_, conv_filter_dilations); + copy(conv_params.input_left_pads_, input_left_pads); + copy(conv_params.input_right_pads_, input_right_pads); + + static_assert(std::is_default_constructible_v); + + // do conv + auto conv = DeviceConvInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument( + out_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{bias_device_buf.GetDeviceBuffer()}, + in_device_buf.GetDeviceBuffer(), + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 1>{d0_g_n_c_wis_lengths}, + std::array, 1>{d0_g_n_c_wis_strides}, + e_g_n_c_wis_lengths, + 
e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + out_element_op, + wei_element_op, + in_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + std::cerr << "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem" + << std::endl; + + return false; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_params.GetFlops(); + std::size_t num_btype = conv_params.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(config.do_verification) + { + // c doesn't physically exist, any layout is fine + Tensor c_host(in_g_n_c_wis_desc); + + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData(); + + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(c_host, + wei, + out, + conv_params.conv_filter_strides_, + conv_params.conv_filter_dilations_, + conv_params.input_left_pads_, + conv_params.input_right_pads_, + PassThrough{}, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + // TODO: implement elementwise operation for host + in_host.ForEach( + [&](auto&, auto idx) { in_element_op(in_host(idx), c_host(idx), bias(idx)); }); + + in_device_buf.FromDevice(in_device.mData.data()); + + return ck::utils::check_err(in_device, in_host); + } + + return true; +} + +int run_grouped_conv_bwd_data_bias_relu_example(int argc, char* argv[]) +{ + namespace ctc = ck::tensor_layout::convolution; + + ExecutionConfig config; + ck::utils::conv::ConvParam conv_params = DefaultConvParams; + + if(!parse_cmd_args(argc, argv, config, conv_params)) + { + return EXIT_FAILURE; + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(conv_params.num_dim_spatial_ != NDimSpatial) + { + std::cerr << "unsupported # of spatials dimensions" << std::endl; + return EXIT_FAILURE; + } + + // output image: GNHWK + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_params); + + // weight: GKYXC + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_params); + + // input image bias: G_C + const auto bias_g_n_c_wis_desc = HostTensorDescriptor({conv_params.G_, + conv_params.N_, + conv_params.C_, + conv_params.input_spatial_lengths_[0], + conv_params.input_spatial_lengths_[1]}, + { + conv_params.C_, // g + 0, // n + 1, // c + 0, // hi + 0 // wi + }); + + // input image: GNHWC + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); + + return !run_conv_bwd_data_bias_relu(config, + conv_params, + out_g_n_k_wos_desc, + wei_g_k_c_xs_desc, + bias_g_n_c_wis_desc, + in_g_n_c_wis_desc, + wei_element_op, + out_element_op, + in_element_op); +} diff --git a/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc new file mode 100644 index 00000000..e50c98bb --- /dev/null +++ b/example/38_grouped_conv_bwd_data_multiple_d/run_grouped_conv_bwd_data_example.inc @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, 
Inc. All rights reserved. + +bool run_conv_bwd_data(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_params, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const OutElementOp& out_element_op, + const WeiElementOp& wei_element_op, + const InElementOp& in_element_op) +{ + Tensor out(out_g_n_k_wos_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor in_host(in_g_n_c_wis_desc); + Tensor in_device(in_g_n_c_wis_desc); + + std::cout << "out: " << out.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "in: " << in_host.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + out.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize()); + + out_device_buf.ToDevice(out.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + // reset input to zero + in_device_buf.SetZero(); + + std::array a_g_n_k_wos_lengths{}; + std::array a_g_n_k_wos_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_c_wis_lengths{}; + std::array e_g_n_c_wis_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(out_g_n_k_wos_desc.GetLengths(), a_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), a_g_n_k_wos_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(in_g_n_c_wis_desc.GetLengths(), e_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), e_g_n_c_wis_strides); + copy(conv_params.conv_filter_strides_, conv_filter_strides); + copy(conv_params.conv_filter_dilations_, conv_filter_dilations); + copy(conv_params.input_left_pads_, input_left_pads); + copy(conv_params.input_right_pads_, input_right_pads); + + static_assert(std::is_default_constructible_v); + + // do conv + auto conv = DeviceConvInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(out_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + in_device_buf.GetDeviceBuffer(), + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{}, + std::array, 0>{}, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + out_element_op, + wei_element_op, + in_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + std::cerr << "wrong! 
device_conv with the specified compilation parameters does " + "not support this Conv problem" + << std::endl; + + return false; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_params.GetFlops(); + std::size_t num_btype = conv_params.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + if(config.do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData(); + + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(in_host, + wei, + out, + conv_params.conv_filter_strides_, + conv_params.conv_filter_dilations_, + conv_params.input_left_pads_, + conv_params.input_right_pads_, + PassThrough{}, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + in_device_buf.FromDevice(in_device.mData.data()); + + return ck::utils::check_err(in_device.mData, in_host.mData); + } + + return true; +} + +int run_grouped_conv_bwd_data_example(int argc, char* argv[]) +{ + namespace ctc = ck::tensor_layout::convolution; + + ExecutionConfig config; + ck::utils::conv::ConvParam conv_params = DefaultConvParams; + + if(!parse_cmd_args(argc, argv, config, conv_params)) + { + return EXIT_FAILURE; + } + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(conv_params.num_dim_spatial_ != NDimSpatial) + { + std::cerr << "unsupported # of spatials dimensions" << std::endl; + return EXIT_FAILURE; + } + + // output image: GNHWK + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_params); + + // weight: GKYXC + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_params); + + // input image: GNHWC + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); + + return !run_conv_bwd_data(config, + conv_params, + out_g_n_k_wos_desc, + wei_g_k_c_xs_desc, + in_g_n_c_wis_desc, + wei_element_op, + out_element_op, + in_element_op); +} diff --git a/example/39_permute/CMakeLists.txt b/example/39_permute/CMakeLists.txt new file mode 100644 index 00000000..573ad723 --- /dev/null +++ b/example/39_permute/CMakeLists.txt @@ -0,0 +1,9 @@ +add_custom_target(example_permute) + +add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp) +add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp) +add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp) + +add_dependencies(example_permute example_permute_1xHxW_fp16) +add_dependencies(example_permute example_permute_NxHxW_fp16) +add_dependencies(example_permute example_permute_HxWx4_fp16) diff --git a/example/39_permute/common.hpp b/example/39_permute/common.hpp new file mode 100644 index 00000000..ab612cea --- /dev/null +++ b/example/39_permute/common.hpp @@ -0,0 +1,456 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
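+
+// Shared host-side helpers for the permute examples: Problem bundles a 3-D shape with the
+// permutation axes; is_valid_axes() accepts exactly the permutations of {0, ..., N-1};
+// transpose(shape, axes) returns the permuted shape, e.g. transpose({2, 3, 4}, {0, 2, 1})
+// yields {2, 4, 3}; extend_shape()/extend_axes() append one trailing dimension, presumably so
+// the examples can hand an (N+1)-D problem to DevicePermuteImpl with the new innermost
+// dimension kept last. The concrete device instances live in the permute_*_fp16.cpp files
+// listed in this directory's CMakeLists.txt.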
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/utility/type.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; +using F64 = double; + +struct Problem final +{ + static constexpr std::size_t NumDim = 3; + + using Shape = std::array; + using Axes = Shape; + + Problem() = delete; + + explicit Problem(const Shape& default_shape, const Axes& default_axes) + : shape(default_shape), axes(default_axes) + { + } + + Shape shape; + Axes axes; +}; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +namespace detail { + +template +struct enlarge_array_size; + +template +struct enlarge_array_size, Difference> +{ + using type = std::array; +}; + +template +using enlarge_array_size_t = typename enlarge_array_size::type; + +template +struct get_array_size; + +template +struct get_array_size> : std::integral_constant +{ +}; + +template +inline constexpr std::size_t get_array_size_v = get_array_size::value; + +template +struct is_iterator : std::false_type +{ +}; + +template +struct is_iterator()), + decltype(++std::declval>()), + decltype(std::declval>()++)>> + : std::true_type +{ +}; + +template +inline constexpr bool is_iterator_v = is_iterator::value; + +struct Placeholder final +{ + template + constexpr inline operator T() const noexcept; +}; + +template +struct is_output_iterator : std::false_type +{ +}; + +template +struct is_output_iterator< + Iterator, + std::void_t() = std::declval())>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_output_iterator_v = is_output_iterator::value; + +template +struct is_bidirectional_iterator : std::false_type +{ +}; + +template +struct is_bidirectional_iterator< + Iterator, + std::void_t>()), + decltype(std::declval>()--)>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_bidirectional_iterator_v = is_bidirectional_iterator::value; + +template +struct is_random_access_iterator : std::false_type +{ +}; + +template +struct is_random_access_iterator() + 1), + decltype(std::declval() - 1), + decltype(std::declval()[1])>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_random_access_iterator_v = is_random_access_iterator::value; + +template +struct is_range : std::false_type +{ +}; + +template +struct is_range())), + decltype(end(std::declval())), + decltype(begin(std::declval()) != end(std::declval()))>> + : std::bool_constant()))>>> +{ +}; + +template +inline constexpr bool is_range_v = is_range::value; + +template +struct is_sized_range : std::false_type +{ +}; + +template +struct is_sized_range()))>> + : std::bool_constant> +{ +}; + +template +inline constexpr bool is_sized_range_v = is_sized_range::value; + +template +struct is_bidirectional_range : std::false_type +{ +}; + +template +struct is_bidirectional_range> + : std::bool_constant< + is_range_v && + is_bidirectional_iterator_v()))>>> +{ +}; + +template +inline constexpr bool is_bidirectional_range_v = is_bidirectional_range::value; + +template +struct 
is_random_access_range : std::false_type +{ +}; + +template +struct is_random_access_range> + : std::bool_constant< + is_range_v && + is_random_access_iterator_v()))>>> +{ +}; + +template +inline constexpr bool is_random_access_range_v = is_random_access_range::value; + +template +class to_array_proxy +{ + static_assert(is_range_v); + + public: + explicit to_array_proxy(const Range& source) noexcept : source_(source) {} + + template + operator std::array() const + { + std::array destination; + + std::copy_n(std::begin(source_), + std::min(Size, std::size(source_)), + std::begin(destination)); + + return destination; + } + + private: + const Range& source_; +}; + +} // namespace detail + +template +inline auto to_array(Range& range) noexcept + -> std::enable_if_t, + detail::to_array_proxy>> +{ + return detail::to_array_proxy>{range}; +} + +template +inline auto is_valid_axes(const Axes& axes) + -> std::enable_if_t, bool> +{ + using std::empty; + if(empty(axes)) + { + return false; + } + + using std::begin, std::end; + std::vector sorted_axes(begin(axes), end(axes)); + + std::sort(begin(sorted_axes), end(sorted_axes)); + const auto last = std::unique(begin(sorted_axes), end(sorted_axes)); + + return (last == end(sorted_axes)) && (*begin(sorted_axes) == 0) && + (*std::prev(last) == size(axes) - 1); +} + +template +inline auto is_valid_shape(const Shape& shape) -> std::enable_if_t, bool> +{ + static_assert(std::is_unsigned_v>); + + using std::begin, std::end; + using std::empty; + return !empty(shape) && std::all_of(begin(shape), end(shape), [](auto dim) { return 0 < dim; }); +} + +template +inline auto is_valid_indices(const Shape& shape, const Indices& indices) + -> std::enable_if_t && detail::is_sized_range_v, bool> +{ + static_assert(std::is_unsigned_v>); + + if(!is_valid_shape(shape)) + { + return false; + } + + using std::empty; + if(empty(indices)) + { + return false; + } + + using std::size; + if(size(shape) != size(indices)) + { + return false; + } + + using std::begin, std::end; + + auto dim = begin(shape); + auto idx = begin(indices); + for(; dim != end(shape) && idx != end(indices); ++dim, ++idx) + { + if(*dim <= *idx) + { + return false; + } + } + + return true; +} + +template +std::array transpose(const std::array& shape, + const std::array& axes) +{ + assert(is_valid_shape(shape) && is_valid_axes(axes)); + + std::array transposed; + auto iter = std::begin(transposed); + for(const auto axis : axes) + { + *iter++ = shape[axis]; + } + + return transposed; +} + +auto extend_shape(const Problem::Shape& shape, std::size_t new_dim) +{ + detail::enlarge_array_size_t extended_shape; + + using std::begin, std::end; + + ck::ranges::copy(shape, begin(extended_shape)); + extended_shape.back() = new_dim; + + return extended_shape; +} + +auto extend_axes(const Problem::Axes& axes) +{ + detail::enlarge_array_size_t extended_axes; + + using std::begin, std::end; + + ck::ranges::copy(axes, begin(extended_axes)); + extended_axes.back() = detail::get_array_size_v; + + return extended_axes; +} + +template +auto advance_indices(const Shape& shape, Indices& indices) -> std::enable_if_t< + detail::is_bidirectional_range_v && detail::is_sized_range_v && + detail::is_bidirectional_range_v && detail::is_sized_range_v, + bool> +{ + using std::size; + if(!(is_valid_shape(shape) && is_valid_indices(shape, indices) && size(shape) == size(indices))) + { + return false; + } + + bool carry = true; + + using std::rbegin, std::rend; + auto dim = rbegin(shape); + auto idx = rbegin(indices); + for(; carry && dim != 
rend(shape) && idx != rend(indices); ++dim, ++idx) + { + *idx = (*idx + carry); + carry = ((*idx == *dim) ? (*idx = 0, true) : false); + } + + return !carry; +} + +template +auto host_permute(const Tensor& src, const Axes& axes, Functor functor, Tensor& dest) + -> std::enable_if_t && detail::is_sized_range_v && + std::is_invocable_v, + std::add_lvalue_reference_t>, + bool> +{ + const auto& shape = src.mDesc.GetLengths(); + const auto& transposed_shape = dest.mDesc.GetLengths(); + if(!(is_valid_shape(shape) && is_valid_shape(transposed_shape))) + { + return false; + } + + using std::size; + if(!is_valid_axes(axes)) + { + return false; + } + + static_assert(detail::is_sized_range_v> && + detail::is_sized_range_v>); + + if(size(shape) != size(transposed_shape)) + { + return false; + } + + static_assert(detail::is_random_access_range_v> && + detail::is_random_access_range_v>); + { + for(std::size_t idx = 0; idx < size(shape); ++idx) + { + if(transposed_shape[idx] != shape[axes[idx]]) + { + return false; + } + } + } + + std::vector indices(size(shape), 0); + if(!is_valid_indices(shape, indices)) + { + return false; + } + + switch(size(shape)) + { + case 3: { + do + { + Dest output = 0; + functor(output, src(indices[0], indices[1], indices[2])); + dest(indices[axes[0]], indices[axes[1]], indices[axes[2]]) = output; + } while(advance_indices(shape, indices)); + } + break; + case 4: { + do + { + Dest output = 0; + functor(output, src(indices[0], indices[1], indices[2], indices[3])); + dest(indices[axes[0]], indices[axes[1]], indices[axes[2]], indices[axes[3]]) = output; + } while(advance_indices(shape, indices)); + } + break; + default: return false; + } + + return true; +} diff --git a/example/39_permute/permute_1xHxW_fp16.cpp b/example/39_permute/permute_1xHxW_fp16.cpp new file mode 100644 index 00000000..d7f9b805 --- /dev/null +++ b/example/39_permute/permute_1xHxW_fp16.cpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = F16; +using OutDataType = F16; + +// clang-format off +using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl +// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst| +// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | +// ######| | | | | | | | | | | | | | | | + < 3, InDataType, OutDataType, PassThrough, 256, 1, 32, 32, 3, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 2, 1>; +// clang-format on + +#include "run_permute_element_example.inc" + +int main() { return !run_permute_element_example({1, 32000, 80}, {0, 2, 1}); } diff --git a/example/39_permute/permute_HxWx4_fp16.cpp b/example/39_permute/permute_HxWx4_fp16.cpp new file mode 100644 index 00000000..342aa134 --- /dev/null +++ b/example/39_permute/permute_HxWx4_fp16.cpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
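// The permute_HxWx4 example below does not move individual 16-bit elements; it
// reinterprets each group of 4 contiguous DataType (F16) values in the innermost
// dimension as one BundleType (F64) and permutes those 8-byte bundles, which is why
// it asserts sizeof(BundleType) % sizeof(DataType) == 0. A minimal sketch of the
// packing idea, using uint16_t and uint64_t as stand-ins for the real element and
// bundle types:
#include <array>
#include <cstdint>
#include <cstring>

int main()
{
    std::array<std::uint16_t, 4> elements{1, 2, 3, 4}; // one group of 4 fp16-sized values

    // Pack the 4 elements into a single 64-bit bundle (bit-for-bit copy).
    std::uint64_t bundle = 0;
    static_assert(sizeof(bundle) == 4 * sizeof(std::uint16_t));
    std::memcpy(&bundle, elements.data(), sizeof(bundle));

    // ... the bundle is what the device kernel would move to its permuted position ...

    // Unpack to recover the original 4 elements unchanged.
    std::array<std::uint16_t, 4> unpacked{};
    std::memcpy(unpacked.data(), &bundle, sizeof(bundle));

    return unpacked == elements ? 0 : 1;
}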
+ +#include "common.hpp" + +using DataType = F16; +using BundleType = F64; + +static_assert(sizeof(BundleType) % sizeof(DataType) == 0); + +// clang-format off +using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl +// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst| +// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | +// ######| | | | | | | | | | | | | | | | + < 3, BundleType, BundleType, PassThrough, 256, 1, 32, 32, 5, S<1, 32, 8>, S<0, 1, 2>, 2, 1, 4, 1>; +// clang-format on + +#include "run_permute_bundle_example.inc" + +int main() { return !run_permute_bundle_example({1, 80, 32000}, {0, 2, 1}); } diff --git a/example/39_permute/permute_NxHxW_fp16.cpp b/example/39_permute/permute_NxHxW_fp16.cpp new file mode 100644 index 00000000..b53975eb --- /dev/null +++ b/example/39_permute/permute_NxHxW_fp16.cpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = F16; +using OutDataType = F16; + +// clang-format off +using DevicePermuteInstance = ck::tensor_operation::device::DevicePermuteImpl +// ######| NumDim| InData| OutData| Elementwise| Block| NPer| HPer| WPer| InBlock| InBlockTransfer| InBlockTransfer| Src| Dst| Src| Dst| +// ######| | Type| Type| Operation| Size| Block| Block| Block| LdsExtraW| ThreadClusterLengths| ThreadClusterArrangeOrder| VectorDim| VectorDim| ScalarPerVector| ScalarPerVector| +// ######| | | | | | | | | | | | | | | | +// ######| | | | | | | | | | | | | | | | + < 3, InDataType, OutDataType, PassThrough, 128, 4, 16, 8, 6, S<2, 16, 4>, S<0, 1, 2>, 2, 1, 2, 1>; +// clang-format on + +#include "run_permute_element_example.inc" + +int main() { return !run_permute_element_example({121, 768, 80}, {0, 2, 1}); } diff --git a/example/39_permute/run_permute_bundle_example.inc b/example/39_permute/run_permute_bundle_example.inc new file mode 100644 index 00000000..70406d63 --- /dev/null +++ b/example/39_permute/run_permute_bundle_example.inc @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +bool run_permute_bundle(const Problem& problem) +{ + const auto& input_bundle_shape = problem.shape; + const auto& input_bundle_axes = problem.axes; + + const auto output_bundle_shape = transpose(input_bundle_shape, input_bundle_axes); + + Tensor input_bundle_tensor(input_bundle_shape); + Tensor output_bundle_tensor(output_bundle_shape); + + // initialize tensor by assigning DataType values + ck::utils::FillUniformDistribution{-1.f, 1.f}(input_bundle_tensor.AsSpan()); + + DeviceMem input_device_buf(input_bundle_tensor.GetElementSpaceSizeInBytes()); + DeviceMem output_device_buf(output_bundle_tensor.GetElementSpaceSizeInBytes()); + + using std::data; + input_device_buf.ToDevice(data(input_bundle_tensor)); + + static_assert(std::is_default_constructible_v); + + auto permute = DevicePermuteInstance{}; + auto argument = permute.MakeArgument(to_array(input_bundle_shape), + to_array(input_bundle_tensor.GetStrides()), + to_array(output_bundle_shape), + to_array(output_bundle_tensor.GetStrides()), + input_device_buf.GetDeviceBuffer(), + output_device_buf.GetDeviceBuffer(), + PassThrough{}); + + if(!permute.IsSupportedArgument(argument)) + { + std::cerr << "The runtime parameters seems not supported by the device instance, exiting!" + << std::endl; + return false; + }; + + auto invoker = permute.MakeInvoker(); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + output_device_buf.FromDevice(data(output_bundle_tensor)); + + constexpr std::size_t NumElemsInBundle = sizeof(BundleType) / sizeof(DataType); + + // extend tensor shape from [N, H, W] to [N, H, W, NumElemsInBundle] + // axes from [0, 2, 1] to [0, 2, 1, 3] + const auto input_shape = extend_shape(input_bundle_shape, NumElemsInBundle); + const auto input_axes = extend_axes(input_bundle_axes); + + using std::begin; + + Tensor input_tensor(input_shape); + ck::ranges::copy(input_bundle_tensor.AsSpan(), begin(input_tensor)); + + Tensor output_tensor(transpose(input_shape, input_axes)); + if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor)) + { + return false; + } + + return ck::utils::check_err(output_bundle_tensor.AsSpan(), + output_tensor.AsSpan(), + "Error: incorrect results in output tensor", + 1e-6, + 1e-6); +} + +bool run_permute_bundle_example(const Problem::Shape& shape, const Problem::Axes& axes) +{ + return run_permute_bundle(Problem{shape, axes}); +} diff --git a/example/39_permute/run_permute_element_example.inc b/example/39_permute/run_permute_element_example.inc new file mode 100644 index 00000000..bc623530 --- /dev/null +++ b/example/39_permute/run_permute_element_example.inc @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
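// The run_permute_element runner below checks the device result against host_permute().
// For a 3-D tensor with axes {0, 2, 1} that reference computation is just a plain loop
// nest; a minimal sketch with row-major std::vector storage and illustrative sizes:
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t N = 2, H = 3, W = 4;

    std::vector<float> in(N * H * W);
    for(std::size_t i = 0; i < in.size(); ++i)
        in[i] = static_cast<float>(i); // arbitrary test data

    // out[n][w][h] = in[n][h][w], i.e. permutation axes {0, 2, 1}
    std::vector<float> out(N * W * H);
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t h = 0; h < H; ++h)
            for(std::size_t w = 0; w < W; ++w)
                out[(n * W + w) * H + h] = in[(n * H + h) * W + w];

    // spot-check one element: in(1, 2, 3) must land at out(1, 3, 2)
    const bool ok = out[(1 * W + 3) * H + 2] == in[(1 * H + 2) * W + 3];
    return ok ? 0 : 1;
}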
+ +#pragma once + +bool run_permute_element(const Problem& problem) +{ + const auto& input_shape = problem.shape; + const auto& input_axes = problem.axes; + + const auto output_shape = transpose(input_shape, input_axes); + + Tensor input_tensor(input_shape); + Tensor output_tensor(output_shape); + + ck::utils::FillUniformDistribution{-1.f, 1.f}(input_tensor); + + DeviceMem input_device_buf(input_tensor.GetElementSpaceSizeInBytes()); + DeviceMem output_device_buf(output_tensor.GetElementSpaceSizeInBytes()); + + using std::data; + input_device_buf.ToDevice(data(input_tensor)); + + static_assert(std::is_default_constructible_v); + + auto permute = DevicePermuteInstance{}; + auto argument = permute.MakeArgument(to_array(input_shape), + to_array(input_tensor.GetStrides()), + to_array(output_shape), + to_array(output_tensor.GetStrides()), + input_device_buf.GetDeviceBuffer(), + output_device_buf.GetDeviceBuffer(), + PassThrough{}); + + if(!permute.IsSupportedArgument(argument)) + { + std::cerr << "The runtime parameters seems not supported by the device instance, exiting!" + << std::endl; + return false; + }; + + auto invoker = permute.MakeInvoker(); + float ave_time = invoker.Run(argument, StreamConfig{nullptr, true}); + + std::cout << "Perf: " << ave_time << " ms" << std::endl; + + output_device_buf.FromDevice(data(output_tensor)); + + Tensor output_tensor_host(output_shape); + if(!host_permute(input_tensor, input_axes, PassThrough{}, output_tensor_host)) + { + return false; + } + + return ck::utils::check_err(output_tensor.AsSpan(), + output_tensor_host.AsSpan(), + "Error: incorrect results in output tensor", + 1e-6, + 1e-6); +} + +bool run_permute_element_example(const Problem::Shape& shape, const Problem::Axes& axes) +{ + return run_permute_element(Problem{shape, axes}); +} diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/example/41_grouped_conv_conv_fwd/CMakeLists.txt new file mode 100644 index 00000000..9cb30f61 --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt @@ -0,0 +1,8 @@ +add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp) +add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp) +add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp) +add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp) + +if(USE_BITINT_EXTENSION_INT4) +add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp) +endif(USE_BITINT_EXTENSION_INT4) diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp new file mode 100644 index 00000000..2aea08c4 --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_bf16.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
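// The grouped_conv_conv_fwd examples that follow lower two back-to-back 1x1
// convolutions onto a single batched GEMM + GEMM (DeviceBatchedGemmGemm_Xdl_CShuffle);
// as the runner's comment notes, this mapping only works for 1x1 convolutions for now.
// A minimal sketch of the dimension mapping for stride-1, unpadded 1x1 filters; all
// sizes here are hypothetical, not the defaults used by the example:
#include <cstdint>
#include <iostream>

int main()
{
    // first conv:  input  [G, N, Ho, Wo, C0] * weight0 [G, K0, 1, 1, C0]
    // second conv: result [G, N, Ho, Wo, K0] * weight1 [G, K1, 1, 1, K0]
    const std::int64_t G = 1, N = 16, Ho = 28, Wo = 28, C0 = 128, K0 = 512, K1 = 128;

    const std::int64_t gemm_batch = G;           // one GEMM pair per group
    const std::int64_t gemm0_m    = N * Ho * Wo; // every output pixel is one GEMM row
    const std::int64_t gemm0_n    = K0;          // first conv's output channels
    const std::int64_t gemm0_k    = C0;          // reduction over input channels (1x1 filter)
    const std::int64_t gemm1_n    = K1;          // second conv's output channels
    const std::int64_t gemm1_k    = K0;          // consumes the first GEMM's N dimension

    std::cout << "batch=" << gemm_batch << " gemm0: " << gemm0_m << 'x' << gemm0_n << 'x'
              << gemm0_k << " gemm1: " << gemm0_m << 'x' << gemm1_n << 'x' << gemm1_k << '\n';
}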
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using In0DataType = ck::bhalf_t; +using Wei0DataType = ck::bhalf_t; +using Acc0DataType = float; +using Wei1DataType = ck::bhalf_t; +using Acc1DataType = float; +using C1ShuffleDataType = float; +using Out1DataType = ck::bhalf_t; + +// This is used for reference code +using Out0DataType = ck::bhalf_t; + +template +using S = ck::Sequence; + +using In0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceBatchedGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + Row, // ALayout + Col, // B0Layout + Col, // B1Layout + Row, // CLayout + In0DataType, // ADataType, + Wei0DataType, // B0DataType, + Wei1DataType, // B1DataType, + Out1DataType, // CDataType, + Acc0DataType, // AccDataType, + C1ShuffleDataType, // CShuffleDataType, + In0ElementOp, // AElementOp, + Wei0ElementOp, // B0ElementOp, + Out0ElementOp, // Acc0ElementOp, + Wei1ElementOp, // B1ElementOp, + Out1ElementOp, // CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 4, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // B1BlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +#include "run_grouped_conv_conv_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp new file mode 100644 index 00000000..b7f80e76 --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp16.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
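// One point the shared runner makes implicitly is that fusing the two convolutions
// keeps the intermediate activation (out0) out of global memory: the reported bytes
// only count in0, wei0, wei1 and out1. A small sketch of that FLOP/byte arithmetic
// for an fp16 case with hypothetical sizes (1x1 filters, stride 1):
#include <cstdint>
#include <iostream>

int main()
{
    const std::int64_t M = 16 * 28 * 28; // N * Ho * Wo
    const std::int64_t C0 = 128, K0 = 512, K1 = 128;
    const std::int64_t bytes_per_elem = 2; // fp16

    const std::int64_t flop = 2 * M * K0 * C0   // first GEMM (conv0)
                            + 2 * M * K1 * K0;  // second GEMM (conv1)

    const std::int64_t bytes = bytes_per_elem * (M * C0    // in0
                                               + K0 * C0   // wei0
                                               + K1 * K0   // wei1
                                               + M * K1);  // out1 (out0 stays on-chip)

    std::cout << "arithmetic intensity ~ " << static_cast<double>(flop) / bytes
              << " flop/byte\n";
}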
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using In0DataType = ck::half_t; +using Wei0DataType = ck::half_t; +using Acc0DataType = float; +using Wei1DataType = ck::half_t; +using Acc1DataType = float; +using C1ShuffleDataType = float; +using Out1DataType = ck::half_t; + +// This is used for reference code +using Out0DataType = ck::half_t; + +template +using S = ck::Sequence; + +using In0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceBatchedGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + Row, // ALayout + Col, // B0Layout + Col, // B1Layout + Row, // CLayout + In0DataType, // ADataType, + Wei0DataType, // B0DataType, + Wei1DataType, // B1DataType, + Out1DataType, // CDataType, + Acc0DataType, // AccDataType, + C1ShuffleDataType, // CShuffleDataType, + In0ElementOp, // AElementOp, + Wei0ElementOp, // B0ElementOp, + Out0ElementOp, // Acc0ElementOp, + Wei1ElementOp, // B1ElementOp, + Out1ElementOp, // CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 4, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // B1BlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +#include "run_grouped_conv_conv_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp new file mode 100644 index 00000000..15e46094 --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_fp32.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using In0DataType = float; +using Wei0DataType = float; +using Acc0DataType = float; +using Wei1DataType = float; +using Acc1DataType = float; +using C1ShuffleDataType = float; +using Out1DataType = float; + +// This is used for reference code +using Out0DataType = float; + +template +using S = ck::Sequence; + +using In0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceBatchedGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + Row, // ALayout + Col, // B0Layout + Col, // B1Layout + Row, // CLayout + In0DataType, // ADataType, + Wei0DataType, // B0DataType, + Wei1DataType, // B1DataType, + Out1DataType, // CDataType, + Acc0DataType, // AccDataType, + C1ShuffleDataType, // CShuffleDataType, + In0ElementOp, // AElementOp, + Wei0ElementOp, // B0ElementOp, + Out0ElementOp, // Acc0ElementOp, + Wei1ElementOp, // B1ElementOp, + Out1ElementOp, // CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 16, // KPerBlock + 128, // Gemm1NPerBlock + 16, // Gemm1KPerBlock + 4, // AK1 + 4, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + S<4, 64, 1>, // B1BlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 2, + 2, + true, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 16, 1, 16>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 4>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +#include "run_grouped_conv_conv_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 0 : 1; } diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp new file mode 100644 index 00000000..2cc4c07c --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int4.cpp @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
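// The int4 example below keeps host data in ck::int4_t but instantiates the device op
// with int8_t "kernel" types and converts element-wise before upload and after
// download (see the BUILD_INT4_EXAMPLE branches in the runner), relying on
// sizeof(ck::int4_t) == sizeof(int8_t). A minimal sketch of that convert-then-copy
// pattern with a hypothetical 4-bit wrapper type (not the ck implementation):
#include <algorithm>
#include <cstdint>
#include <vector>

struct Int4 // hypothetical stand-in for ck::int4_t: one value per byte, range [-8, 7]
{
    std::int8_t value;
};

int main()
{
    std::vector<Int4> host_data{{-5}, {3}, {7}, {-8}}; // what the host tensors hold

    // widen to the kernel-side representation before copying to the device buffer
    std::vector<std::int8_t> kernel_data(host_data.size());
    std::transform(host_data.begin(), host_data.end(), kernel_data.begin(),
                   [](Int4 v) { return v.value; });

    // ... kernel_data.data() is what would be handed to the device memory upload ...

    return kernel_data[0] == -5 ? 0 : 1;
}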
+ +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using In0DataType = ck::int4_t; +using Wei0DataType = ck::int4_t; +using KernelIn0DataType = int8_t; +using KernelWei0DataType = int8_t; +using Acc0DataType = int32_t; +using Wei1DataType = ck::int4_t; +using KernelWei1DataType = int8_t; +using Acc1DataType = int32_t; +using C1ShuffleDataType = int32_t; +using Out1DataType = ck::int4_t; +using KernelOut1DataType = int8_t; + +// This is used for reference code +using Out0DataType = ck::int4_t; + +template +using S = ck::Sequence; + +using In0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceBatchedGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + Row, // ALayout + Col, // B0Layout + Col, // B1Layout + Row, // CLayout + KernelIn0DataType, // ADataType, + KernelWei0DataType, // B0DataType, + KernelWei1DataType, // B1DataType, + KernelOut1DataType, // CDataType, + Acc0DataType, // AccDataType, + C1ShuffleDataType, // CShuffleDataType, + In0ElementOp, // AElementOp, + Wei0ElementOp, // B0ElementOp, + Out0ElementOp, // Acc0ElementOp, + Wei1ElementOp, // B1ElementOp, + Out1ElementOp, // CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 128, // Gemm1NPerBlock + 64, // Gemm1KPerBlock + 16, // AK1 + 16, // BK1 + 4, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<4, 64, 1>, // B1BlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +#define BUILD_INT4_EXAMPLE +#include "run_grouped_conv_conv_fwd_example.inc" + +#if defined(BUILD_INT4_EXAMPLE) && defined(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) +static_assert(sizeof(ck::int4_t) == sizeof(int8_t)); +#endif + +int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 
0 : 1; } diff --git a/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp new file mode 100644 index 00000000..40ff0f69 --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/grouped_conv_conv_fwd_xdl_int8.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using In0DataType = int8_t; +using Wei0DataType = int8_t; +using Acc0DataType = int32_t; +using Wei1DataType = int8_t; +using Acc1DataType = int32_t; +using C1ShuffleDataType = int32_t; +using Out1DataType = int8_t; + +// This is used for reference code +using Out0DataType = int8_t; + +template +using S = ck::Sequence; + +using In0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Wei1ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out0ElementOp = ck::tensor_operation::element_wise::PassThrough; +using Out1ElementOp = ck::tensor_operation::element_wise::UnaryConvert; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +using DeviceBatchedGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + Row, // ALayout + Col, // B0Layout + Col, // B1Layout + Row, // CLayout + In0DataType, // ADataType, + Wei0DataType, // B0DataType, + Wei1DataType, // B1DataType, + Out1DataType, // CDataType, + Acc0DataType, // AccDataType, + C1ShuffleDataType, // CShuffleDataType, + In0ElementOp, // AElementOp, + Wei0ElementOp, // B0ElementOp, + Out0ElementOp, // Acc0ElementOp, + Wei1ElementOp, // B1ElementOp, + Out1ElementOp, // CElementOp, + GemmDefault, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 64, // KPerBlock + 128, // Gemm1NPerBlock + 64, // Gemm1KPerBlock + 16, // AK1 + 16, // BK1 + 4, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 16, + 16, + true, + S<4, 64, 1>, // B1BlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 4, + 4, + true, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + +#include "run_grouped_conv_conv_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_grouped_conv_conv_fwd_example(argc, argv) ? 
0 : 1; } diff --git a/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc new file mode 100644 index 00000000..a2c97f4d --- /dev/null +++ b/example/41_grouped_conv_conv_fwd/run_grouped_conv_conv_fwd_example.inc @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +template +bool run_grouped_conv_conv_fwd(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv0_param, + const ck::utils::conv::ConvParam& conv1_param, + const HostTensorDescriptor& in0_g_n_c_wis_desc, + const HostTensorDescriptor& wei0_g_k_c_xs_desc, + const HostTensorDescriptor& out0_g_n_k_wos_desc, + const HostTensorDescriptor& wei1_g_k_c_xs_desc, + const HostTensorDescriptor& out1_g_n_k_wos_desc, + const In0ElementOp& in0_element_op, + const Wei0ElementOp& wei0_element_op, + const Wei1ElementOp& wei1_element_op, + const Out0ElementOp& out0_element_op, + const Out1ElementOp& out1_element_op) +{ + Tensor in0(in0_g_n_c_wis_desc); + Tensor wei0(wei0_g_k_c_xs_desc); + Tensor wei1(wei1_g_k_c_xs_desc); + Tensor out1_host(out1_g_n_k_wos_desc); + Tensor out1_device(out1_g_n_k_wos_desc); + + std::cout << "in0: " << in0.mDesc << std::endl; + std::cout << "wei0: " << wei0.mDesc << std::endl; + std::cout << "wei1: " << wei1.mDesc << std::endl; + std::cout << "out1: " << out1_host.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in0.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei0.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei1.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in0.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei0.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + wei1.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + +#ifdef BUILD_INT4_EXAMPLE + DeviceMem in0_device_buf(sizeof(KernelIn0DataType) * in0.mDesc.GetElementSpaceSize()); + DeviceMem wei0_device_buf(sizeof(KernelWei0DataType) * wei0.mDesc.GetElementSpaceSize()); + DeviceMem wei1_device_buf(sizeof(KernelWei1DataType) * wei1.mDesc.GetElementSpaceSize()); + DeviceMem out1_device_buf(sizeof(KernelOut1DataType) * out1_device.mDesc.GetElementSpaceSize()); + + const Tensor in0_converted(in0); + const Tensor wei0_converted(wei0); + const Tensor wei1_converted(wei1); + + in0_device_buf.ToDevice(in0_converted.mData.data()); + wei0_device_buf.ToDevice(wei0_converted.mData.data()); + wei1_device_buf.ToDevice(wei1_converted.mData.data()); +#else + DeviceMem in0_device_buf(sizeof(In0DataType) * in0.mDesc.GetElementSpaceSize()); + DeviceMem wei0_device_buf(sizeof(Wei0DataType) * wei0.mDesc.GetElementSpaceSize()); + DeviceMem wei1_device_buf(sizeof(Wei1DataType) * wei1.mDesc.GetElementSpaceSize()); + DeviceMem out1_device_buf(sizeof(Out1DataType) * out1_device.mDesc.GetElementSpaceSize()); + + in0_device_buf.ToDevice(in0.mData.data()); + wei0_device_buf.ToDevice(wei0.mData.data()); + wei1_device_buf.ToDevice(wei1.mData.data()); +#endif + + std::array a0_g_n_c_wis_lengths{}; + std::array a0_g_n_c_wis_strides{}; + std::array b0_g_k_c_xs_lengths{}; + std::array b0_g_k_c_xs_strides{}; + std::array b1_g_k_c_xs_lengths{}; + std::array b1_g_k_c_xs_strides{}; + std::array e1_g_n_k_wos_lengths{}; + std::array e1_g_n_k_wos_strides{}; + std::array conv0_filter_strides{}; + std::array conv0_filter_dilations{}; + std::array input0_left_pads{}; + std::array input0_right_pads{}; + std::array 
conv1_filter_strides{}; + std::array conv1_filter_dilations{}; + std::array input1_left_pads{}; + std::array input1_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in0_g_n_c_wis_desc.GetLengths(), a0_g_n_c_wis_lengths); + copy(in0_g_n_c_wis_desc.GetStrides(), a0_g_n_c_wis_strides); + copy(wei0_g_k_c_xs_desc.GetLengths(), b0_g_k_c_xs_lengths); + copy(wei0_g_k_c_xs_desc.GetStrides(), b0_g_k_c_xs_strides); + copy(wei1_g_k_c_xs_desc.GetLengths(), b1_g_k_c_xs_lengths); + copy(wei1_g_k_c_xs_desc.GetStrides(), b1_g_k_c_xs_strides); + copy(out1_g_n_k_wos_desc.GetLengths(), e1_g_n_k_wos_lengths); + copy(out1_g_n_k_wos_desc.GetStrides(), e1_g_n_k_wos_strides); + copy(conv0_param.conv_filter_strides_, conv0_filter_strides); + copy(conv0_param.conv_filter_dilations_, conv0_filter_dilations); + copy(conv0_param.input_left_pads_, input0_left_pads); + copy(conv0_param.input_right_pads_, input0_right_pads); + copy(conv1_param.conv_filter_strides_, conv1_filter_strides); + copy(conv1_param.conv_filter_dilations_, conv1_filter_dilations); + copy(conv1_param.input_left_pads_, input1_left_pads); + copy(conv1_param.input_right_pads_, input1_right_pads); + + // do Conv using GEMM, only works for 1x1 conv for now + const ck::index_t gemm_batch = a0_g_n_c_wis_lengths[0]; + + const ck::index_t gemm0_m_length = + e1_g_n_k_wos_lengths[1] * + ck::accumulate_n( + e1_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>{}); + + const ck::index_t gemm0_n_length = b0_g_k_c_xs_lengths[1]; + + const ck::index_t gemm0_k_length = ck::accumulate_n( + b0_g_k_c_xs_lengths.begin() + 2, NDimSpatial + 1, 1, std::multiplies<>{}); + + const ck::index_t gemm1_n_length = b1_g_k_c_xs_lengths[1]; + + // + const ck::index_t a0_stride = a0_g_n_c_wis_strides[2 + NDimSpatial]; + const ck::index_t b0_stride = b0_g_k_c_xs_strides[2 + NDimSpatial]; + const ck::index_t b1_stride = b1_g_k_c_xs_strides[2 + NDimSpatial]; + const ck::index_t e1_stride = e1_g_n_k_wos_strides[2 + NDimSpatial]; + + // + const ck::index_t a0_batch_stride = a0_g_n_c_wis_strides[0]; + const ck::index_t b0_batch_stride = b0_g_k_c_xs_strides[0]; + const ck::index_t b1_batch_stride = b1_g_k_c_xs_strides[0]; + const ck::index_t e1_batch_stride = e1_g_n_k_wos_strides[0]; + + auto device_op = DeviceOpInstance{}; + auto invoker = device_op.MakeInvoker(); + auto argument = device_op.MakeArgument( +#ifdef BUILD_INT4_EXAMPLE + static_cast(in0_device_buf.GetDeviceBuffer()), + static_cast(wei0_device_buf.GetDeviceBuffer()), + static_cast(wei1_device_buf.GetDeviceBuffer()), + static_cast(out1_device_buf.GetDeviceBuffer()), +#else + static_cast(in0_device_buf.GetDeviceBuffer()), + static_cast(wei0_device_buf.GetDeviceBuffer()), + static_cast(wei1_device_buf.GetDeviceBuffer()), + static_cast(out1_device_buf.GetDeviceBuffer()), +#endif + gemm0_m_length, + gemm0_n_length, + gemm0_k_length, + gemm1_n_length, + gemm_batch, + a0_stride, + b0_stride, + b1_stride, + e1_stride, + a0_batch_stride, + b0_batch_stride, + b1_batch_stride, + e1_batch_stride, + in0_element_op, + wei0_element_op, + out0_element_op, + wei1_element_op, + out1_element_op); + + if(!device_op.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv0_param.GetFlops() + conv1_param.GetFlops(); + std::size_t num_btype = conv0_param.template GetInputByte() + + conv0_param.template GetWeightByte() + + conv1_param.template GetWeightByte() + + conv1_param.template GetOutputByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << device_op.GetTypeString() << std::endl; + + if(do_verification) + { + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + Tensor out0_host(out0_g_n_k_wos_desc); + + auto ref_conv0 = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_conv1 = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_conv0_invoker = ref_conv0.MakeInvoker(); + auto ref_conv1_invoker = ref_conv1.MakeInvoker(); + + auto ref_conv0_argument = ref_conv0.MakeArgument(in0, + wei0, + out0_host, + conv0_param.conv_filter_strides_, + conv0_param.conv_filter_dilations_, + conv0_param.input_left_pads_, + conv0_param.input_right_pads_, + in0_element_op, + wei0_element_op, + out0_element_op); + + auto ref_conv1_argument = ref_conv1.MakeArgument(out0_host, + wei1, + out1_host, + conv1_param.conv_filter_strides_, + conv1_param.conv_filter_dilations_, + conv1_param.input_left_pads_, + conv1_param.input_right_pads_, + out0_element_op, + wei1_element_op, + out1_element_op); + + ref_conv0_invoker.Run(ref_conv0_argument); + ref_conv1_invoker.Run(ref_conv1_argument); + +#ifdef BUILD_INT4_EXAMPLE + Tensor out1_device_converted(out1_host.mDesc); + + out1_device_buf.FromDevice(out1_device_converted.mData.data()); + + out1_device = out1_device_converted.CopyAsType(); +#else + out1_device_buf.FromDevice(out1_device.mData.data()); +#endif + + return ck::utils::check_err( + out1_device, out1_host, "Error: incorrect results!", 1e-5f, 1e-4f); + } + + return true; +} + +bool run_grouped_conv_conv_fwd_example(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::utils::conv::ConvParam conv0_param{ + 2, 1, 128, 512, 128, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}; + + ck::utils::conv::ConvParam conv1_param{ + 2, 1, 128, 128, 512, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}; + + if(argc == 1) + { + // use default case + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + + const auto in0_element_op = In0ElementOp{}; + const auto wei0_element_op = Wei0ElementOp{}; + const auto wei1_element_op = Wei1ElementOp{}; + const auto out0_element_op = Out0ElementOp{}; + const auto out1_element_op = Out1ElementOp{}; + + const auto run = [&](auto ndim_spatial, + auto in0_layout, + auto wei0_layout, + auto wei1_layout, + auto out1_layout) { + constexpr ck::index_t ndim_spatial_value = ndim_spatial.value; + + using In0Layout = decltype(in0_layout); + using Wei0Layout = decltype(wei0_layout); + using Wei1Layout = decltype(wei1_layout); + using Out1Layout = decltype(out1_layout); + + const auto in0_g_n_c_wis_desc = + 
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv0_param); + + const auto wei0_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv0_param); + + // out0 doesn't physical exist, any layout for host verification is OK + const auto out0_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv0_param); + + const auto wei1_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv1_param); + + const auto out1_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv1_param); + + return run_grouped_conv_conv_fwd(do_verification, + init_method, + time_kernel, + conv0_param, + conv1_param, + in0_g_n_c_wis_desc, + wei0_g_k_c_xs_desc, + out0_g_n_k_wos_desc, + wei1_g_k_c_xs_desc, + out1_g_n_k_wos_desc, + in0_element_op, + wei0_element_op, + wei1_element_op, + out0_element_op, + out1_element_op); + }; + + namespace ctc = ck::tensor_layout::convolution; + + if(conv0_param.num_dim_spatial_ == 1) + { + return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GKXC{}, ctc::GNWK{}); + } + else if(conv0_param.num_dim_spatial_ == 2) + { + return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GKYXC{}, ctc::GNHWK{}); + } + else if(conv0_param.num_dim_spatial_ == 3) + { + return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GKZYXC{}, ctc::GNDHWK{}); + } + + return true; +} diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt new file mode 100644 index 00000000..c3b7b825 --- /dev/null +++ b/example/42_groupnorm/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_groupnorm_sigmoid_fp16 groupnorm_sigmoid_fp16.cpp) diff --git a/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp new file mode 100644 index 00000000..e62001d6 --- /dev/null +++ b/example/42_groupnorm/groupnorm_sigmoid_fp16.cpp @@ -0,0 +1,174 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
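// The groupnorm example below normalizes an [N, H, W, G, C] tensor, reducing over
// H, W and C for every (n, g) pair (dims {1, 2, 4}), scales and shifts per (g, c),
// and fuses a Swish-style activation y = x * sigmoid(x) via YElementOp. A minimal
// host-side sketch of that computation with tiny illustrative sizes and float
// instead of fp16:
#include <cmath>
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t N = 1, H = 2, W = 2, G = 2, C = 3;
    const float eps = 1e-6f;

    std::vector<float> x(N * H * W * G * C, 1.0f);
    std::vector<float> gamma(G * C, 1.0f), beta(G * C, 0.0f), y(x.size());

    auto at = [&](std::size_t n, std::size_t h, std::size_t w, std::size_t g, std::size_t c) {
        return (((n * H + h) * W + w) * G + g) * C + c; // packed NHWGC layout
    };

    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t g = 0; g < G; ++g)
        {
            // mean and variance over (H, W, C) for this (n, g)
            float mean = 0, var = 0;
            const float count = static_cast<float>(H * W * C);
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                    for(std::size_t c = 0; c < C; ++c)
                        mean += x[at(n, h, w, g, c)];
            mean /= count;
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                    for(std::size_t c = 0; c < C; ++c)
                    {
                        const float d = x[at(n, h, w, g, c)] - mean;
                        var += d * d;
                    }
            var /= count;

            // normalize, apply per-(g, c) gamma/beta, then Swish: v * sigmoid(v)
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                    for(std::size_t c = 0; c < C; ++c)
                    {
                        const float norm = (x[at(n, h, w, g, c)] - mean) / std::sqrt(var + eps);
                        const float v    = gamma[g * C + c] * norm + beta[g * C + c];
                        y[at(n, h, w, g, c)] = v / (1.0f + std::exp(-v));
                    }
        }

    return y.empty() ? 1 : 0;
}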
+ +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" + +constexpr int Rank = 5; +constexpr int NumReduceDim = 3; + +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; + +struct YElementOp +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(ck::is_same::value || ck::is_same::value || + ck::is_same::value, + "Data type is not supported by this operation!"); + + T a; + + ck::tensor_operation::element_wise::Sigmoid{}(a, x); + + y = x * a; + }; +}; + +using DeviceInstance = + ck::tensor_operation::device::DeviceNormalizationImpl; // OutScalarPerVector + +int main(int argc, char* argv[]) +{ + ck::index_t N = 2; + ck::index_t H = 32; + ck::index_t W = 32; + ck::index_t G = 32; + ck::index_t C = 30; + + if(argc == 1) + { + // use default case + } + else if(argc == 6) + { + N = std::stoi(argv[1]); + H = std::stoi(argv[2]); + W = std::stoi(argv[3]); + G = std::stoi(argv[4]); + C = std::stoi(argv[5]); + } + else + { + std::cerr << "arg1 to 5: N, H, W, G, C" << std::endl; + + return 1; + } + + Tensor x({N, H, W, G, C}); + Tensor y({N, H, W, G, C}); + Tensor gamma({G, C}); + Tensor beta({G, C}); + + ck::utils::FillUniformDistribution{0.f, 1.f}(x); + ck::utils::FillUniformDistribution{0.f, 1.f}(gamma); + ck::utils::FillUniformDistribution{0.f, 1.f}(beta); + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + const auto y_element_op = YElementOp{}; + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {N, H, W, G, C}, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + {0, 0, 0, C, 1}, + {0, 0, 0, C, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1, 2, 4}, // reduction dimension: [H, W, C] + 1e-6, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + nullptr, + nullptr, + y_element_op); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true, true}); + + std::size_t num_btype = sizeof(XDataType) * N * H * W * G * C + + sizeof(YDataType) * N * H * W * G * C + sizeof(GammaDataType) * G * C + + sizeof(BetaDataType) * G * C; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << 
ave_time << " ms, " << gb_per_sec << " GB/s, " + << device_instance.GetTypeString() << std::endl; + + bool pass = true; + { + Tensor host_y({N, H, W, G, C}); + using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, y_element_op, {N, H, W, G, C}, 1e-6); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); + } + + return (pass ? 0 : 1); +} diff --git a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt new file mode 100644 index 00000000..c29f18f1 --- /dev/null +++ b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp) +add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp) diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp new file mode 100644 index 00000000..7ac4b682 --- /dev/null +++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp16.cpp @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using ADataType = F16; +using BDataType = F16; +using AccDataType = F32; +using CShuffleDataType = F16; +using DDataType = F16; +using DsDataType = ck::Tuple; +using EDataType = F16; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 1; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //############################################| NumDimG| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| Gemm| A| B| DE| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| 
BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle|CBlockTransferClusterLengths| CBlockTransfer| + //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceSplitKContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, F16, F16, F32, F16, DsDataType, F16, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>; +// clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = + false> +struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_gs_ms_ks_{a_gs_ms_ks}, + b_gs_ns_ks_{b_gs_ns_ks}, + e_gs_ms_ns_{e_gs_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_gs_ms_ks_; + const Tensor& b_gs_ns_ks_; + Tensor& e_gs_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_G2_M2_N2_K1::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) { + const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, + ck::type_convert(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0))); + arg.b_element_op_( + v_b, + ck::type_convert(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_gs_ms_ns_.mDesc.GetLengths()[0], + arg.e_gs_ms_ns_.mDesc.GetLengths()[1], + arg.e_gs_ms_ns_.mDesc.GetLengths()[2], + arg.e_gs_ms_ns_.mDesc.GetLengths()[3], + arg.e_gs_ms_ns_.mDesc.GetLengths()[4], + arg.e_gs_ms_ns_.mDesc.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const 
ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto MakeArgument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{ + a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_G2_M2_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + int split_k = 1; + + ck::index_t G0 = 1; + ck::index_t G1 = 2; + + ck::index_t M0 = 4; + ck::index_t M1 = 256; + + ck::index_t N0 = 16; + ck::index_t N1 = 128; + + ck::index_t K0 = 64 * 2; + + // A[G0, G1, M0, M1, K0] + std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1}; + // B[G0, G1, N0, N1, K0] + std::vector b_gs_ns_ks_lengths{G0, G1, N0, N1, K0}; + std::vector b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1}; + + // D[G0, G1, M0, N0, M1, N1] + std::vector d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1}; + // E[G0, G1, M0, N0, M1, N1] + std::vector e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector e_gs_ms_ns_strides{ + G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1}; + + if(argc == 1) + { + // use default case + } + else if(argc == 5) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + split_k = std::stoi(argv[4]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + + Tensor a_gs_ms_ks( + std::vector(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()), + std::vector(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end())); + Tensor b_gs_ns_ks( + std::vector(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()), + std::vector(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end())); + Tensor d_gs_ms_ns( + std::vector(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()), + std::vector(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end())); + Tensor e_gs_ms_ns_host_result( + std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), + std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor e_gs_ms_ns_device_result( + std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), + std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; + std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl; + 
std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{1}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * + e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b_device_buf.ToDevice(b_gs_ns_ks.mData.data()); + d_device_buf.ToDevice(d_gs_ms_ns.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op, + split_k); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + ck::index_t G = std::accumulate(e_gs_ms_ns_lengths.begin(), + e_gs_ms_ns_lengths.begin() + NumDimG, + ck::index_t{1}, + std::multiplies{}); + + ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG, + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, + ck::index_t{1}, + std::multiplies{}); + + ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN, + ck::index_t{1}, + std::multiplies{}); + + ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, + a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK, + ck::index_t{1}, + std::multiplies{}); + + std::size_t flop = std::size_t(2) * G * M * N * K; + std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N + + sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_ms_ns_host_result( + std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), + std::vector(e_gs_ms_ns_strides.begin(), 
e_gs_ms_ns_strides.end())); + + using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + e_gs_ms_ns_host_result.ForEach([&](auto&, auto idx) { + cde_element_op(e_gs_ms_ns_host_result(idx), c_ms_ns_host_result(idx), d_gs_ms_ns(idx)); + }); + + return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData) + ? 0 + : 1; + } + + return 0; +} diff --git a/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp new file mode 100644 index 00000000..764e55ef --- /dev/null +++ b/example/43_splitk_gemm_bias_e_permute/splitk_gemm_bias_e_permute_xdl_fp32.cpp @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +template +using S = ck::Sequence; + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Add = ck::tensor_operation::element_wise::Add; + +using ADataType = F32; +using BDataType = F32; +using AccDataType = F32; +using CShuffleDataType = F32; +using DDataType = F32; +using DsDataType = ck::Tuple; +using EDataType = F32; + +static constexpr ck::index_t NumDimG = 2; +static constexpr ck::index_t NumDimM = 2; +static constexpr ck::index_t NumDimN = 2; +static constexpr ck::index_t NumDimK = 1; + +using AElementOp = ck::tensor_operation::element_wise::PassThrough; +using BElementOp = ck::tensor_operation::element_wise::PassThrough; +using CDEElementOp = ck::tensor_operation::element_wise::Add; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; + +static constexpr auto ABSpec = ck::tensor_operation::device::TensorSpecialization::Packed; +static constexpr auto DESpec = ck::tensor_operation::device::TensorSpecialization::Default; + +// clang-format off +using DeviceOpInstanceKKNN = ck::tensor_operation::device:: + //############################################| NumDimG| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| Gemm| A| B| DE| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle|CBlockTransferClusterLengths| CBlockTransfer| + //############################################| | | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Spacialization| Spacialization| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| 
SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //############################################| | | | | | | | | | | Operation| Operation| Operation| | | | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceSplitKContractionMultipleD_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, ABSpec, ABSpec, DESpec, 1, 256, 256, 128, 32, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, 1, 1, 1, S<1, 32, 1, 4>, 4>; +// clang-format on + +using DeviceOpInstance = DeviceOpInstanceKKNN; + +// hardcoded for NumDimM == NumDimN == NumDimK == 2 +template = + false> +struct ReferenceContraction_G2_M2_N2_K1 : public ck::tensor_operation::device::BaseOperator +{ + // Argument + struct Argument : public ck::tensor_operation::device::BaseArgument + { + Argument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_gs_ms_ks_{a_gs_ms_ks}, + b_gs_ns_ks_{b_gs_ns_ks}, + e_gs_ms_ns_{e_gs_ms_ns}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + } + + const Tensor& a_gs_ms_ks_; + const Tensor& b_gs_ns_ks_; + Tensor& e_gs_ms_ns_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public ck::tensor_operation::device::BaseInvoker + { + using Argument = ReferenceContraction_G2_M2_N2_K1::Argument; + + float Run(const Argument& arg) + { + auto f_ms_ns = [&](auto g0, auto g1, auto m0, auto m1, auto n0, auto n1) { + const int K0 = arg.a_gs_ms_ks_.mDesc.GetLengths()[4]; + + AccDataType v_acc = 0; + + for(int k0 = 0; k0 < K0; ++k0) + { + AccDataType v_a; + AccDataType v_b; + + arg.a_element_op_( + v_a, + ck::type_convert(arg.a_gs_ms_ks_(g0, g1, m0, m1, k0))); + arg.b_element_op_( + v_b, + ck::type_convert(arg.b_gs_ns_ks_(g0, g1, n0, n1, k0))); + + v_acc += v_a * v_b; + } + + AccDataType v_c; + + arg.cde_element_op_(v_c, v_acc); + + arg.e_gs_ms_ns_(g0, g1, m0, m1, n0, n1) = v_c; + }; + + make_ParallelTensorFunctor(f_ms_ns, + arg.e_gs_ms_ns_.mDesc.GetLengths()[0], + arg.e_gs_ms_ns_.mDesc.GetLengths()[1], + arg.e_gs_ms_ns_.mDesc.GetLengths()[2], + arg.e_gs_ms_ns_.mDesc.GetLengths()[3], + arg.e_gs_ms_ns_.mDesc.GetLengths()[4], + arg.e_gs_ms_ns_.mDesc.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const ck::tensor_operation::device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const ck::tensor_operation::device::BaseArgument*) override + { + return true; + } + + static auto 
MakeArgument(const Tensor& a_gs_ms_ks, + const Tensor& b_gs_ns_ks, + Tensor& e_gs_ms_ns, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{ + a_gs_ms_ks, b_gs_ns_ks, e_gs_ms_ns, a_element_op, b_element_op, cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceContraction_G2_M2_N2_K1" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +int main(int argc, char* argv[]) +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + int split_k = 1; + + ck::index_t G0 = 1; + ck::index_t G1 = 2; + + ck::index_t M0 = 4; + ck::index_t M1 = 256; + + ck::index_t N0 = 16; + ck::index_t N1 = 128; + + ck::index_t K0 = 64 * 2; + + // A[G0, G1, M0, M1, K0] + std::vector a_gs_ms_ks_lengths{G0, G1, M0, M1, K0}; + std::vector a_gs_ms_ks_strides{G1 * M0 * M1 * K0, M0 * M1 * K0, M1 * K0, K0, 1}; + // B[G0, G1, N0, N1, K0] + std::vector b_gs_ns_ks_lengths{G0, G1, N0, N1, K0}; + std::vector b_gs_ns_ks_strides{G1 * N0 * N1 * K0, N0 * N1 * K0, N1 * K0, K0, 1}; + + // D[G0, G1, M0, N0, M1, N1] + std::vector d_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector d_gs_ms_ns_strides{G1 * N0 * N1, N0 * N1, 0, 0, N1, 1}; + // E[G0, G1, M0, N0, M1, N1] + std::vector e_gs_ms_ns_lengths{G0, G1, M0, M1, N0, N1}; + std::vector e_gs_ms_ns_strides{ + G1 * M0 * N0 * M1 * N1, M0 * N0 * M1 * N1, N0 * M1 * N1, N1, M1 * N1, 1}; + + if(argc == 1) + { + // use default case + } + else if(argc == 5) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + split_k = std::stoi(argv[4]); + } + else + { + printf("arg1: verification (0=no, 1=yes)\n"); + printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n"); + printf("arg3: time kernel (0=no, 1=yes)\n"); + exit(0); + } + + Tensor a_gs_ms_ks( + std::vector(a_gs_ms_ks_lengths.begin(), a_gs_ms_ks_lengths.end()), + std::vector(a_gs_ms_ks_strides.begin(), a_gs_ms_ks_strides.end())); + Tensor b_gs_ns_ks( + std::vector(b_gs_ns_ks_lengths.begin(), b_gs_ns_ks_lengths.end()), + std::vector(b_gs_ns_ks_strides.begin(), b_gs_ns_ks_strides.end())); + Tensor d_gs_ms_ns( + std::vector(d_gs_ms_ns_lengths.begin(), d_gs_ms_ns_lengths.end()), + std::vector(d_gs_ms_ns_strides.begin(), d_gs_ms_ns_strides.end())); + Tensor e_gs_ms_ns_host_result( + std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), + std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + Tensor e_gs_ms_ns_device_result( + std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), + std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b_gs_ns_ks: " << b_gs_ns_ks.mDesc << std::endl; + std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl; + std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + 
b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); + b_gs_ns_ks.GenerateTensorValue(GeneratorTensor_1{1}); + d_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1{1}); + break; + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem d_device_buf(sizeof(DDataType) * d_gs_ms_ns.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * + e_gs_ms_ns_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b_device_buf.ToDevice(b_gs_ns_ks.mData.data()); + d_device_buf.ToDevice(d_gs_ms_ns.mData.data()); + + // set zero + e_device_buf.SetZero(); + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto cde_element_op = CDEElementOp{}; + + // device operation + auto op = DeviceOpInstance{}; + auto invoker = op.MakeInvoker(); + auto argument = op.MakeArgument(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + std::array, 1>{d_gs_ms_ns_lengths}, + std::array, 1>{d_gs_ms_ns_strides}, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op, + split_k); + + if(!op.IsSupportedArgument(argument)) + { + std::cout << op.GetTypeString() << " does not support this problem" << std::endl; + + return 0; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + ck::index_t G = std::accumulate(e_gs_ms_ns_lengths.begin(), + e_gs_ms_ns_lengths.begin() + NumDimG, + ck::index_t{1}, + std::multiplies{}); + + ck::index_t M = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG, + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, + ck::index_t{1}, + std::multiplies{}); + + ck::index_t N = std::accumulate(e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM, + e_gs_ms_ns_lengths.begin() + NumDimG + NumDimM + NumDimN, + ck::index_t{1}, + std::multiplies{}); + + ck::index_t K = std::accumulate(a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM, + a_gs_ms_ks_lengths.begin() + NumDimG + NumDimM + NumDimK, + ck::index_t{1}, + std::multiplies{}); + + std::size_t flop = std::size_t(2) * G * M * N * K; + std::size_t num_btype = sizeof(ADataType) * G * M * K + sizeof(BDataType) * G * K * N + + sizeof(DDataType) * G * M * N + sizeof(EDataType) * G * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << op.GetTypeString() << std::endl; + + e_device_buf.FromDevice(e_gs_ms_ns_device_result.mData.data()); + + if(do_verification) + { + Tensor c_ms_ns_host_result( + std::vector(e_gs_ms_ns_lengths.begin(), e_gs_ms_ns_lengths.end()), + std::vector(e_gs_ms_ns_strides.begin(), e_gs_ms_ns_strides.end())); + + using ReferenceOpInstance = ReferenceContraction_G2_M2_N2_K1; + + auto ref_gemm = ReferenceOpInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_gs_ms_ks, b_gs_ns_ks, c_ms_ns_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + e_gs_ms_ns_host_result.ForEach([&](auto&, auto idx) { + 
cde_element_op(e_gs_ms_ns_host_result(idx), c_ms_ns_host_result(idx), d_gs_ms_ns(idx)); + }); + + return ck::utils::check_err(e_gs_ms_ns_device_result.mData, e_gs_ms_ns_host_result.mData) + ? 0 + : 1; + } + + return 0; +} diff --git a/example/44_conv2d_fwd_quantization/CMakeLists.txt b/example/44_conv2d_fwd_quantization/CMakeLists.txt new file mode 100644 index 00000000..f02e5110 --- /dev/null +++ b/example/44_conv2d_fwd_quantization/CMakeLists.txt @@ -0,0 +1,3 @@ +add_example_executable(example_conv2d_fwd_xdl_perchannel_quantization_int8 conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp) +add_example_executable(example_conv2d_fwd_xdl_perlayer_quantization_int8 conv2d_fwd_xdl_perlayer_quantization_int8.cpp) +add_example_executable(example_conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8 conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp) diff --git a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp new file mode 100644 index 00000000..832665ed --- /dev/null +++ b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perchannel_quantization_int8.cpp @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using RequantScaleDataType = float; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // 
ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 64, 1, 4>, + 8>; + +template +bool run_grouped_conv_fwd(bool do_verification, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& bias_g_k_desc, + const HostTensorDescriptor& requant_scale_g_k_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(bias_g_k_desc); + Tensor requant_scale(requant_scale_g_k_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "requant_scale: " << requant_scale.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + in.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + wei.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + bias.GenerateTensorValue(GeneratorTensor_2{-128, 127}); + requant_scale.GenerateTensorValue(GeneratorTensor_2{0, 1}); + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize()); + DeviceMem requant_scale_device_buf(sizeof(RequantScaleDataType) * + requant_scale.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + requant_scale_device_buf.ToDevice(requant_scale.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array d0_g_n_k_wos_lengths{}; + std::array d0_g_n_k_wos_strides{}; + std::array d1_g_n_k_wos_lengths{}; + std::array d1_g_n_k_wos_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(bias_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths); + copy(bias_g_k_desc.GetStrides(), d0_g_n_k_wos_strides); + copy(requant_scale_g_k_desc.GetLengths(), d1_g_n_k_wos_lengths); + copy(requant_scale_g_k_desc.GetStrides(), d1_g_n_k_wos_strides); + 
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument( + in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer(), requant_scale_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {d0_g_n_k_wos_lengths, d1_g_n_k_wos_lengths}, + {d0_g_n_k_wos_strides, d1_g_n_k_wos_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verification) + { + Tensor c_host(out_g_n_k_wos_desc); + + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + c_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + // TODO: implement elementwise operation for host + out_host.ForEach([&](auto&, auto idx) { + out_element_op(out_host(idx), c_host(idx), bias(idx), requant_scale(idx)); + }); + + out_device_buf.FromDevice(out_device.mData.data()); + + pass &= + ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + } + + return (pass ? 
0 : 1); +} + +int main() +{ + bool do_verification = true; + bool time_kernel = true; + const ck::index_t ndim_spatial = 2; + + ck::utils::conv::ConvParam conv_param{ + ndim_spatial, // n_dim + 1, // group + 4, // batch + 64, // output channels + 32, // input chanels + {3, 3}, // weight HW + {71, 71}, // x HW + {2, 2}, // strides + {1, 1}, // dilations + {1, 1}, // left_pads + {1, 1} // right_pads + }; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{ActivationOp{}}; + + using InLayout = ck::tensor_layout::convolution::GNHWC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using BiasLayout = ck::tensor_layout::convolution::G_K; + using RequantScaleLayout = ck::tensor_layout::convolution::G_K; + using OutLayout = ck::tensor_layout::convolution::GNHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + // TODO - make_bias_host_tensor_descriptor_g_n_k_wos_packed() + const auto bias_g_k_desc = HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + + const auto requant_scale_g_k_desc = bias_g_k_desc; + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + std::cout << out_g_n_k_wos_desc << std::endl; + + using deviceOp = DeviceGroupedConvNDFwdInstance; + + return run_grouped_conv_fwd(do_verification, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + bias_g_k_desc, + requant_scale_g_k_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); +} diff --git a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp new file mode 100644 index 00000000..f5401350 --- /dev/null +++ b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_bias_relu_perlayer_quantization_int8.cpp @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
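This file and the preceding per-channel example differ only in the output elementwise operator: Add_Activation_Mul2_Clamp takes a per-output-channel float scale tensor, while Add_Activation_Mul_Clamp used below takes a single scalar scale (0.5f in main). The standalone sketch that follows shows what such a bias + ReLU + requantize epilogue computes for one int32 accumulator; the exact rounding and saturation rules of CK's operators are assumed here rather than taken from the source.

#include <algorithm>
#include <cmath>
#include <cstdint>

// scale is one value for per-layer quantization, or scale[k] for output
// channel k in the per-channel variant. The rounding mode is an assumption.
int8_t requantize(int32_t acc, int32_t bias, float scale)
{
    float v = static_cast<float>(std::max(acc + bias, 0)) * scale;      // bias add, ReLU, rescale
    v       = std::round(v);
    return static_cast<int8_t>(std::min(127.0f, std::max(-128.0f, v))); // clamp to int8 range
}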
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using BiasDataType = int32_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using ActivationOp = ck::tensor_operation::element_wise::Relu; +using OutElementOp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 64, 1, 4>, + 8>; + +template +bool run_grouped_conv_fwd(bool do_verification, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& bias_g_k_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor bias(bias_g_k_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "bias: " << bias.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + 
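The bias passed into this function holds one value per (group, output channel), but it is described with the full 5-d (G, N, K, Ho, Wo) output shape and zero strides on the N/Ho/Wo dimensions (see the bias_g_k_desc construction in main further below), so the same value is reused across batch and spatial positions. A minimal standalone sketch of why a zero stride yields that broadcast when a linear offset is computed:

#include <array>
#include <cstddef>

// With strides {K, 0, 1, 0, 0} for indices (g, n, k, ho, wo), every (n, ho, wo)
// maps to the same element, i.e. one bias value per (g, k).
std::size_t linear_offset(const std::array<std::size_t, 5>& idx,
                          const std::array<std::size_t, 5>& stride)
{
    std::size_t off = 0;
    for(std::size_t d = 0; d < idx.size(); ++d)
        off += idx[d] * stride[d]; // a zero stride ignores that index -> broadcast
    return off;
}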
in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(BiasDataType) * bias.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + bias_device_buf.ToDevice(bias.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array d0_g_n_k_wos_lengths{}; + std::array d0_g_n_k_wos_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(bias_g_k_desc.GetLengths(), d0_g_n_k_wos_lengths); + copy(bias_g_k_desc.GetStrides(), d0_g_n_k_wos_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {bias_device_buf.GetDeviceBuffer()}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {d0_g_n_k_wos_lengths}, + {d0_g_n_k_wos_strides}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! 
device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verification) + { + Tensor c_host(out_g_n_k_wos_desc); + + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + c_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + PassThrough{}); + + ref_invoker.Run(ref_argument); + + // TODO: implement elementwise operation for host + out_host.ForEach( + [&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx), bias(idx)); }); + + out_device_buf.FromDevice(out_device.mData.data()); + + pass &= + ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + } + + return (pass ? 0 : 1); +} + +int main() +{ + bool do_verification = true; + bool time_kernel = true; + const ck::index_t ndim_spatial = 2; + + ck::utils::conv::ConvParam conv_param{ + ndim_spatial, // n_dim + 1, // group + 4, // batch + 64, // output channels + 32, // input chanels + {3, 3}, // weight HW + {71, 71}, // x HW + {2, 2}, // strides + {1, 1}, // dilations + {1, 1}, // left_pads + {1, 1} // right_pads + }; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{0.5f, ActivationOp{}}; + + using InLayout = ck::tensor_layout::convolution::GNHWC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using BiasLayout = ck::tensor_layout::convolution::G_K; + using OutLayout = ck::tensor_layout::convolution::GNHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + // TODO - make_bias_host_tensor_descriptor_g_n_k_wos_packed() + const auto bias_g_k_desc = HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + std::cout << out_g_n_k_wos_desc << std::endl; + + return run_grouped_conv_fwd< + ndim_spatial, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceGroupedConvNDFwdInstance>( + do_verification, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + bias_g_k_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); +} diff --git a/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp new file mode 100644 index 00000000..2d46d866 --- /dev/null +++ 
b/example/44_conv2d_fwd_quantization/conv2d_fwd_xdl_perlayer_quantization_int8.cpp @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using InDataType = int8_t; +using WeiDataType = int8_t; +using AccDataType = int32_t; +using CShuffleDataType = int32_t; +using OutDataType = int8_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using ActivationOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 64, // KPerBlock + 16, // AK1 + 16, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 16, // ABlockTransferSrcScalarPerVector + 16, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 16, // BBlockTransferSrcScalarPerVector + 16, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 64, 1, 4>, + 16>; + +template +bool run_grouped_conv_fwd(bool do_verification, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op, + const OutElementOp& out_element_op) +{ + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << 
out_host.mDesc << std::endl; + + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {}, + {}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + bool pass = true; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + + pass &= + ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f); + } + + return (pass ? 
0 : 1); +} + +int main() +{ + bool do_verification = true; + bool time_kernel = true; + const ck::index_t ndim_spatial = 2; + + ck::utils::conv::ConvParam conv_param{ + ndim_spatial, // n_dim + 1, // group + 4, // batch + 64, // output channels + 32, // input chanels + {3, 3}, // weight HW + {71, 71}, // x HW + {2, 2}, // strides + {1, 1}, // dilations + {1, 1}, // left_pads + {1, 1} // right_pads + }; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{0.5f, ActivationOp{}}; + + using InLayout = ck::tensor_layout::convolution::GNHWC; + using WeiLayout = ck::tensor_layout::convolution::GKYXC; + using OutLayout = ck::tensor_layout::convolution::GNHWK; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + return run_grouped_conv_fwd< + ndim_spatial, + InDataType, + WeiDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceGroupedConvNDFwdInstance>( + do_verification, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op, + out_element_op); +} diff --git a/example/44_elementwise_permute/CMakeLists.txt b/example/44_elementwise_permute/CMakeLists.txt new file mode 100644 index 00000000..0e0091a9 --- /dev/null +++ b/example/44_elementwise_permute/CMakeLists.txt @@ -0,0 +1,2 @@ +add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) +add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp) diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp new file mode 100644 index 00000000..0bbdbe52 --- /dev/null +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16.cpp @@ -0,0 +1,116 @@ +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using ADataType = F16; +using BDataType = F16; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using DeviceElementwisePermuteInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + PassThrough, + 4, + 8, + ck::Sequence<8>, + ck::Sequence<1>>; + +template +void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor) +{ + for(std::size_t n = 0; n < A_nchw.mDesc.GetLengths()[0]; ++n) + for(std::size_t c = 0; c < A_nchw.mDesc.GetLengths()[1]; ++c) + for(std::size_t h = 0; h < A_nchw.mDesc.GetLengths()[2]; ++h) + for(std::size_t w = 0; w < A_nchw.mDesc.GetLengths()[3]; ++w) + { + auto a_val = A_nchw(n, c, h, w); + functor(B_nhwc(n, h, w, c), a_val); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = true; + + std::vector nchw = {16, 128, 32, 64}; + std::vector nhwc = {16, 32, 64, 128}; + Tensor a(nchw); 
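The NCHW-to-NHWC transpose in this example is encoded entirely in the stride arrays handed to the device instance a little further below: A and B are addressed with the same logical (n, c, h, w) index, and B's strides place that index at its NHWC memory location. The following standalone sketch (plain float, same 16x128x32x64 shape, no CK types) spells out that mapping:

#include <cstddef>
#include <vector>

int main()
{
    const std::size_t N = 16, C = 128, H = 32, W = 64;

    // A is packed NCHW; B is packed NHWC but indexed with (n, c, h, w),
    // mirroring the a_strides / b_strides arrays used by the example.
    const std::size_t a_stride[4] = {C * H * W, H * W, W, 1};
    const std::size_t b_stride[4] = {H * W * C, 1, W * C, C};

    std::vector<float> A(N * C * H * W), B(N * C * H * W);
    for(std::size_t i = 0; i < A.size(); ++i)
        A[i] = static_cast<float>(i);

    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t h = 0; h < H; ++h)
                for(std::size_t w = 0; w < W; ++w)
                    B[n * b_stride[0] + c * b_stride[1] + h * b_stride[2] + w * b_stride[3]] =
                        A[n * a_stride[0] + c * a_stride[1] + h * a_stride[2] + w * a_stride[3]];

    return 0; // B now holds A transposed to NHWC
}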
+ Tensor b(nhwc); + + a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a.mData.data()); + + std::array input = {a_device_buf.GetDeviceBuffer()}; + std::array output = {b_device_buf.GetDeviceBuffer()}; + + std::array ab_lengths; + std::array a_strides = {static_cast(nchw[1] * nchw[2] * nchw[3]), + static_cast(nchw[2] * nchw[3]), + static_cast(nchw[3]), + 1}; + std::array b_strides = {static_cast(nhwc[1] * nhwc[2] * nhwc[3]), + 1, + static_cast(nhwc[2] * nhwc[3]), + static_cast(nhwc[3])}; + + ck::ranges::copy(nchw, ab_lengths.begin()); + + auto broadcastPermute = DeviceElementwisePermuteInstance{}; + auto argument = broadcastPermute.MakeArgumentPointer( + ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); + + if(!broadcastPermute.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + + std::cout << "A (nchw): " << a.mDesc << std::endl; + std::cout << "B (nhwc): " << b.mDesc << std::endl; + + auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); + float ave_time = + broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3]; + + std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) + + sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + bool pass = true; + + if(do_verification) + { + b_device_buf.FromDevice(b.mData.data()); + Tensor host_b(nhwc); + host_elementwise4D(host_b, a, PassThrough{}); + + pass &= + ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); + } + + return pass ? 
0 : 1; +} diff --git a/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp new file mode 100644 index 00000000..f16ad3b3 --- /dev/null +++ b/example/44_elementwise_permute/elementwise_permute_4D_fp16_2d.cpp @@ -0,0 +1,130 @@ +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise_2d.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +using F16 = ck::half_t; + +using ADataType = F16; +using BDataType = F16; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using DeviceElementwisePermuteInstance = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + PassThrough, + 3, // NumDim_M + 1, // NumDim_N + 8, + 8, + ck::Sequence<8>, + ck::Sequence<8>>; + +template +void host_elementwise4D(HostTensorB& B_nhwc, + const HostTensorA& A_nchw, + const std::vector& shape_nchw, + Functor functor) +{ + for(std::size_t n = 0; n < shape_nchw[0]; ++n) + for(std::size_t c = 0; c < shape_nchw[1]; ++c) + for(std::size_t h = 0; h < shape_nchw[2]; ++h) + for(std::size_t w = 0; w < shape_nchw[3]; ++w) + { + auto a_val = A_nchw(n, c, h, w); + functor(B_nhwc(n, h, w, c), a_val); + } +} + +int main() +{ + bool do_verification = true; + bool time_kernel = true; + + const int N = 120; + const int C = 128; + const int H = 32; + const int W = 1024; + + /**const int N = 120; + const int H = 32; + const int W = 64; + + const int C = 128;**/ + + std::vector nchw = {N, C, H, W}; + std::vector nhwc = {N, H, W, C}; + + Tensor a(nchw); + Tensor b(nhwc); + + a.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + + DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a.mData.data()); + // LogRangeAsType(std::cout << "Tensor a : ", a.mData, ",") << std::endl; + + std::array input = {a_device_buf.GetDeviceBuffer()}; + std::array output = {b_device_buf.GetDeviceBuffer()}; + + std::array ab_lengths{N, H, W, C}; + + std::array a_strides = {C * H * W, W, 1, H * W}; + std::array b_strides = {H * W * C, W * C, C, 1}; + + auto broadcastPermute = DeviceElementwisePermuteInstance{}; + auto argument = broadcastPermute.MakeArgumentPointer( + ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{}); + + if(!broadcastPermute.IsSupportedArgument(argument.get())) + { + throw std::runtime_error( + "The runtime parameters seems not supported by the device instance, exiting!"); + }; + + std::cout << "A (nchw): " << a.mDesc << std::endl; + std::cout << "B (nhwc): " << b.mDesc << std::endl; + + auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer(); + float ave_time = + broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * nchw[0] * nchw[1] * nchw[2] * nchw[3]; + + std::size_t num_btype = sizeof(ADataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]) + + sizeof(BDataType) * (nchw[0] * nchw[1] * nchw[2] * nchw[3]); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" + << std::endl; + + bool 
pass = true; + + if(do_verification) + { + b_device_buf.FromDevice(b.mData.data()); + // LogRangeAsType(std::cout << "Tensor b : ", b.mData, ",") << std::endl; + + Tensor host_b(nhwc); + host_elementwise4D, Tensor, PassThrough>( + host_b, a, nchw, PassThrough{}); + + // LogRangeAsType(std::cout << "Host b : ", host_b.mData, ",") << std::endl; + pass &= + ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); + } + + return pass ? 0 : 1; +} diff --git a/example/45_elementwise_normalization/CMakeLists.txt b/example/45_elementwise_normalization/CMakeLists.txt new file mode 100644 index 00000000..8f5b9d4d --- /dev/null +++ b/example/45_elementwise_normalization/CMakeLists.txt @@ -0,0 +1 @@ +add_example_executable(example_elementwise_layernorm_blockwise elementwise_layernorm_blockwise.cpp) diff --git a/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp new file mode 100644 index 00000000..12f6bc6f --- /dev/null +++ b/example/45_elementwise_normalization/elementwise_layernorm_blockwise.cpp @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +using ADataType = ck::half_t; // Input 1 +using BDataType = ck::half_t; // Input 2 +using XDataType = ck::half_t; +using GammaDataType = ck::half_t; +using BetaDataType = ck::half_t; +using YDataType = ck::half_t; +using AccDataType = float; +using XElementwiseOperation = ck::tensor_operation::element_wise::Add; +using YElementwiseOperation = ck::tensor_operation::element_wise::PassThrough; + +constexpr int Rank = 2; +constexpr int NumReduceDim = 1; + +// X = Elementwise(input1, input2, input3, ...) 
+// Y = Layernorm(X, beta, gamma) +using DeviceInstance = ck::tensor_operation::device::DeviceElementwiseNormalizationImpl< + ck::Tuple, + GammaDataType, + BetaDataType, + AccDataType, + YDataType, + XElementwiseOperation, + YElementwiseOperation, + Rank, + NumReduceDim, + 256, // BlockSize + 8, // ClusterM + 32, // ClusterK + 1, // SliceM + 32, // SliceK + 1, // SrcVecDim (0=M, 1=K) + 8, // SrcScalarPerVector + 1, // GammaVecDim (0=M, 1=K) + 8, // GammaScalarPerVector + 1, // BetaVecDim (0=M, 1=K) + 8, // BetaScalarPerVector + 8>; // OutScalarPerVector + +template +void host_elementwise2D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t m = 0; m < shape[0]; ++m) + for(std::size_t n = 0; n < shape[1]; ++n) + { + auto a_val = A(m, n); + auto b_val = B(m, n); + ctype c_val = 0; + functor(c_val, a_val, b_val); + C(m, n) = c_val; + } +} + +int main() +{ + bool time_kernel = true; + + ck::index_t M = 48 * 256; + ck::index_t N = 1024; + ck::index_t Stride = N; + + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor(std::vector({len}), + std::vector({stride})); + }; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + return HostTensorDescriptor(std::vector({row, col}), + std::vector({stride, 1})); + }; + + Tensor a(f_host_tensor_descriptor2d(M, N, Stride)); + Tensor b(f_host_tensor_descriptor2d(M, N, Stride)); + Tensor gamma(f_host_tensor_descriptor1d(N, 1)); + Tensor beta(f_host_tensor_descriptor1d(N, 1)); + Tensor y(f_host_tensor_descriptor2d(M, N, Stride)); + + a.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + + DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_dev(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + a_dev.ToDevice(a.mData.data()); + b_dev.ToDevice(b.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + std::array input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()}; + + auto device_instance = DeviceInstance{}; + auto argument_ptr = device_instance.MakeArgumentPointer( + {M, N}, + { + std::vector{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}, + std::vector{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}, + }, + {0, 1}, + {0, 1}, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + {1}, + 1e-4, + input, + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + XElementwiseOperation{}, + YElementwiseOperation{}); + + if(!device_instance.IsSupportedArgument(argument_ptr.get())) + { + std::cout << "The runtime parameters are not supported" << std::endl; + return 1; + }; + + auto invoker_ptr = device_instance.MakeInvokerPointer(); + float ela_time = 0; + ela_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + float data_mem_size = M * N * sizeof(ADataType) + M * N * sizeof(BDataType) + + M * N * sizeof(YDataType) + N * sizeof(GammaDataType) + + N * sizeof(BetaDataType); + float 
bandwidth = data_mem_size * 1000 / ela_time / 1024 / 1024 / 1024; + + std::cout << "Bandwidth is : " << bandwidth << "GB/s . " << std::endl; + std::cout << "Time elapase is : " << ela_time << " ms . " << std::endl; + + bool pass = true; + { + std::vector mn = {static_cast(M), + static_cast(N)}; + Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); + host_elementwise2D, + Tensor, + Tensor, + XElementwiseOperation>(x, a, b, mn, XElementwiseOperation{}); + + Tensor host_y(f_host_tensor_descriptor2d(M, N, Stride)); + using ReferenceInstance = + ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, YElementwiseOperation{}, {M, N}, {1}, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + + y_dev.FromDevice(y.mData.data()); + pass &= + ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + if(!(pass)) + { + std::cout << "layernorm wrong" << std::endl; + } + } + return (pass ? 0 : 1); +} diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 00000000..1fdd2f6d --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1,32 @@ +include_directories(BEFORE + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/library/include +) + +add_custom_target(examples) + +function(add_example_executable EXAMPLE_NAME FILE_NAME) + message("adding example ${EXAMPLE_NAME}") + add_executable(${EXAMPLE_NAME} ${FILE_NAME}) + target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) + add_test(NAME ${EXAMPLE_NAME} COMMAND $ ${ARGN}) + add_dependencies(examples ${EXAMPLE_NAME}) + add_dependencies(check ${EXAMPLE_NAME}) + rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples) +endfunction(add_example_executable EXAMPLE_NAME) + +function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) + message("adding example ${EXAMPLE_NAME}") + add_executable(${EXAMPLE_NAME} ${FILE_NAME}) + target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) + add_dependencies(examples ${EXAMPLE_NAME}) + rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples) +endfunction(add_example_executable_no_testing EXAMPLE_NAME) + +# add all example subdir +file(GLOB dir_list LIST_DIRECTORIES true *) +FOREACH(subdir ${dir_list}) + IF(IS_DIRECTORY "${subdir}") + add_subdirectory(${subdir}) + ENDIF() +ENDFOREACH() diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp new file mode 100644 index 00000000..4be2e85d --- /dev/null +++ b/include/ck/ck.hpp @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
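+
+// ck.hpp: compile-time configuration for Composable Kernel device code, covering GPU target
+// checks, buffer resource constants, instruction availability macros (MFMA / WMMA / dot
+// products), and experimental feature and workaround switches.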
+ +#pragma once + +#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS +#include "hip/hip_runtime.h" +#include "hip/hip_fp16.h" +#endif + +#define CK_TIME_KERNEL 1 + +// constant address space for kernel parameter +// https://llvm.org/docs/AMDGPUUsage.html#address-spaces +#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4))) + +// launch bounds +#define CK_USE_LAUNCH_BOUNDS 1 + +#ifdef CK_USE_LAUNCH_BOUNDS +#define CK_MAX_THREAD_PER_BLOCK 256 +#define CK_MIN_BLOCK_PER_CU 2 +#endif + +// check GPU target +#ifdef __HIP_DEVICE_COMPILE__ +#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ + defined(__gfx90a__) || defined(__gfx1030__) || defined(__gfx1100__)) +#error Not supported target +#endif +#endif + +// buffer resource +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_BUFFER_RESOURCE_3RD_DWORD -1 +#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \ + defined(__gfx90a__) // for GPU code +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000 +#elif defined(__gfx1030__) // for GPU code +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000 +#elif defined(__gfx1100__) // for GPU code +#define CK_BUFFER_RESOURCE_3RD_DWORD 0x10020000 +#endif + +// FMA instruction +#ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing +#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code +#define CK_USE_AMD_V_MAC_F32 +#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \ + defined(__gfx1030__) // for GPU code +#define CK_USE_AMD_V_FMAC_F32 +#define CK_USE_AMD_V_DOT2_F32_F16 +#define CK_USE_AMD_V_DOT4_I32_I8 +#endif + +// MFMA instruction +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_USE_AMD_MFMA +#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code +#define CK_USE_AMD_MFMA +#endif + +#if defined(__gfx90a__) +#define CK_USE_AMD_MFMA_BF16_1K_OP +#endif + +// WMMA instruction +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_USE_AMD_WMMA +#elif defined(__gfx1100__) // for GPU code +#define CK_USE_AMD_WMMA +#endif + +// buffer load +#define CK_USE_AMD_BUFFER_LOAD 1 + +// buffer store +#define CK_USE_AMD_BUFFER_STORE 1 + +// buffer atomic add: integer +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1 + +// buffer atomic add: floating point +#ifndef __HIP_DEVICE_COMPILE__ // for host code +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 +#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1 +#else // for GPU code +#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0 +#endif + +#if defined(__gfx90a__) // for GPU code +#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1 +#else +#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0 +#endif + +// inline asm +#define CK_USE_AMD_INLINE_ASM 1 + +// inner product (DLOP) +#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1 + +// block synchronization only s_wait lgkmcnt(0), not vmcnt(0) +#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1 + +// experimental feature: multi index implemented as array +#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0 + +// experimental feature: static tensor descriptor +#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0 + +// experimental feature: buffer load/store/atomic-add/ OOB trick +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting. Don't use this hack unless absolutely necessary! 
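+// (Because of the #ifndef guard below, CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
+// can also be set from the compiler command line, e.g.
+// -DCK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1.)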
+// FIXME: make the behavior of buffer load a configurable (template) parameter for each usage
+#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
+#endif
+#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
+
+// experimental feature: in-register sub-dword transpose
+#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
+
+// experimental feature: merge transformation uses magic number division
+#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
+
+// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
+// pointer of scalar
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
+
+// experimental feature: use __builtin_memcpy instead of union to do bit_cast
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
+
+// experimental feature: optimize for inter-wave scheduling policy
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 1
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
+// this will let make_default_loop_scheduler() return the interwave scheduling flag by default
+#define CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING 0
+// experimental feature: add instances using interwave scheduling
+#define CK_EXPERIMENTAL_INTER_WAVE_INSTANCES 1
+// experimental feature: add instances using pipeline v2
+#define CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES 1
+
+// hack: has underlying assumptions that need to be satisfied, otherwise it's a bug
+// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
+// thread-invariant, otherwise it's a bug
+// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
+#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+
+// workaround: compiler crash when compiling recursive lambda
+#define CK_WORKAROUND_SWDEV_275126 1
+
+// workaround: compiler crash when using buffer load/store for i8
+#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
+
+// workaround: compiler generating inefficient ds_write instructions
+#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
+
+// workaround: verification failure, due to a compiler regression, for conv bwd-data fp16 using
+// some tuning parameters
+#define CK_WORKAROUND_SWDEV_325164 0
+
+// workaround: a BF16 attention kernel for gfx908 is likely affected by a compiler issue
+#ifdef __gfx908__
+#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 1
+#else // __gfx90a__, ...
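+// (other supported targets are not believed to be affected, so the workaround stays disabled)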
+#define CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE 0 +#endif // __gfx908__ + +namespace ck { + +enum struct InMemoryDataOperationEnum +{ + Set, + AtomicAdd, + AtomicMax, + Add +}; + +// FIXME: use regular Sequence and remove this +template +struct InMemoryDataOperationEnumSequence +{ + static constexpr int mSize = sizeof...(Is); + + __host__ __device__ static constexpr InMemoryDataOperationEnum At(int I) + { + // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 + const InMemoryDataOperationEnum mData[mSize + 1] = {Is..., InMemoryDataOperationEnum::Set}; + return mData[I]; + } +}; + +// index type +using index_t = int32_t; +using long_index_t = int64_t; + +} // namespace ck diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp new file mode 100644 index 00000000..e2cbdb73 --- /dev/null +++ b/include/ck/host_utility/device_prop.hpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +namespace ck { + +inline std::string get_device_name() +{ + hipDeviceProp_t props{}; + int device; + auto status = hipGetDevice(&device); + if(status != hipSuccess) + { + return std::string(); + } + + status = hipGetDeviceProperties(&props, device); + if(status != hipSuccess) + { + return std::string(); + } + const std::string raw_name(props.gcnArchName); + + // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40 + static std::map device_name_map = { + {"Ellesmere", "gfx803"}, + {"Baffin", "gfx803"}, + {"RacerX", "gfx803"}, + {"Polaris10", "gfx803"}, + {"Polaris11", "gfx803"}, + {"Tonga", "gfx803"}, + {"Fiji", "gfx803"}, + {"gfx800", "gfx803"}, + {"gfx802", "gfx803"}, + {"gfx804", "gfx803"}, + {"Vega10", "gfx900"}, + {"gfx901", "gfx900"}, + {"10.3.0 Sienna_Cichlid 18", "gfx1030"}, + }; + + const auto name = raw_name.substr(0, raw_name.find(':')); // str.substr(0, npos) returns str. + + auto match = device_name_map.find(name); + if(match != device_name_map.end()) + return match->second; + return name; +} + +} // namespace ck diff --git a/include/ck/host_utility/hip_check_error.hpp b/include/ck/host_utility/hip_check_error.hpp new file mode 100644 index 00000000..d3dc8eaf --- /dev/null +++ b/include/ck/host_utility/hip_check_error.hpp @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +inline void hip_check_error(hipError_t x) +{ + if(x != hipSuccess) + { + std::ostringstream ss; + ss << "HIP runtime error: " << hipGetErrorString(x) << ". " << __FILE__ << ": " << __LINE__ + << "in function: " << __func__; + throw std::runtime_error(ss.str()); + } +} diff --git a/include/ck/host_utility/io.hpp b/include/ck/host_utility/io.hpp new file mode 100644 index 00000000..ac871959 --- /dev/null +++ b/include/ck/host_utility/io.hpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
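+
+// io.hpp: host-side stream output helpers (operator<< overloads for std::vector, std::array and
+// ck::TensorDescriptor), useful when printing shapes and ranges for debugging.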
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_description/tensor_descriptor.hpp" + +template +std::ostream& operator<<(std::ostream& os, const std::vector& v) +{ + std::copy(std::begin(v), std::end(v), std::ostream_iterator(os, " ")); + return os; +} + +template +std::ostream& operator<<(std::ostream& os, const std::array& v) +{ + std::copy(std::begin(v), std::end(v), std::ostream_iterator(os, " ")); + return os; +} + +template +std::ostream& operator<<(std::ostream& os, const ck::TensorDescriptor& desc) +{ + constexpr ck::index_t nDim = ck::remove_cvref_t::GetNumOfDimension(); + + os << "{"; + + ck::static_for<0, nDim - 1, 1>{}([&](auto i) { os << desc.GetLength(i) << ", "; }); + + os << desc.GetLength(ck::Number{}); + + os << "}"; + + return os; +} diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp new file mode 100644 index 00000000..ed6e2f0b --- /dev/null +++ b/include/ck/host_utility/kernel_launch.hpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/stream_config.hpp" +#include "ck/host_utility/hip_check_error.hpp" + +template +float launch_and_time_kernel(const StreamConfig& stream_config, + F kernel, + dim3 grid_dim, + dim3 block_dim, + std::size_t lds_byte, + Args... args) +{ +#if CK_TIME_KERNEL + if(stream_config.time_kernel_) + { + printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n", + __func__, + grid_dim.x, + grid_dim.y, + grid_dim.z, + block_dim.x, + block_dim.y, + block_dim.z); + + const int nrepeat = 10; + + printf("Warm up 1 time\n"); + + // warm up + kernel<<>>(args...); + + printf("Start running %d times...\n", nrepeat); + + hipEvent_t start, stop; + + hip_check_error(hipEventCreate(&start)); + hip_check_error(hipEventCreate(&stop)); + + hip_check_error(hipDeviceSynchronize()); + hip_check_error(hipEventRecord(start, stream_config.stream_id_)); + + for(int i = 0; i < nrepeat; ++i) + { + kernel<<>>(args...); + } + + hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); + hip_check_error(hipEventSynchronize(stop)); + + float total_time = 0; + + hip_check_error(hipEventElapsedTime(&total_time, start, stop)); + + return total_time / nrepeat; + } + else + { + kernel<<>>(args...); + + return 0; + } +#else + kernel<<>>(args...); + + return 0; +#endif +} diff --git a/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..db8e48df --- /dev/null +++ b/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
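+
+// Worked example of the decomposition implemented below, using the formulas in this file:
+// for a 3x3 filter (Y = X = 3) with stride 2 and dilation 1, GcdStrideDilation = gcd(2, 1) = 1,
+// so YTilde = XTilde = 2 and YDot = XDot = ceil(3 / 2) = 2; the backward-data convolution is
+// therefore decomposed into YTilde * XTilde = 4 GEMMs, one per (IYTilde, IXTilde) index pair.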
+ +#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// Number of GEMMs = YTilde * XTilde +// GemmM = C +// GemmN = N * HTildeSlice * WTildeSlice +// GemmK = K * YDotSlice * XDotSlice +template +__host__ __device__ constexpr auto +transform_backward_data_convolution_into_gemm_v4r1_nhwc_kyxc_nhwk( + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number, + Number, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + constexpr auto IYTilde = Number{}; + constexpr auto IXTilde = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto HTilde = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = + math::min(HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = + math::min(WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - IYTilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - IXTilde, XTilde); + + const auto K1 = GemmK1; + const auto K0 = K / K1; + + // 
weight tensor + const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(IYTilde), + make_freeze_transform(IXTilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + +#if 1 + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // output tensor + // this add padding check + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + 
Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + +#if 1 + const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto out_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(IYTilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(IXTilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_htildeslice_wtildeslice_c_grid_desc, + make_tuple(make_pass_through_transform(C), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice))), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, + out_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..5391b595 --- /dev/null +++ b/include/ck/problem_transform/transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk.hpp @@ 
-0,0 +1,355 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_DATA_CONVOLUTION_INTO_GEMM_V4R1R2_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// A: out +// B: wei +// C: in +// Number of GEMMs = YTilde * XTilde +// GemmM = N * HTildeSlice * WTildeSlice +// GemmN = C +// GemmK = K * YDotSlice * XDotSlice +template +__host__ __device__ constexpr auto +transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk( + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + IYTilde i_ytilde, + IXTilde i_xtilde, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto HTilde = Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = + math::min(HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = + math::min(WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = 
math::integer_divide_ceil(X - i_xtilde, XTilde); + + const auto K1 = GemmK1; + const auto K0 = K / K1; + + // A: output tensor + // this add padding check + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + +#if 1 + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<5, 1, 3>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // B: weight tensor + const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + 
make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + +#if 1 + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#else + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(K0, YDotSlice, XDotSlice)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<0, 2, 3>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); +#endif + + // C: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_htildeslice_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); +} + +// A: out +// B: wei +// C: in +// Number of GEMMs = 1 +// GemmM = N * Ho * Wo +// GemmN = C +// GemmK = K +template +__host__ __device__ constexpr auto +transform_backward_data_convolution_into_gemm_v4r1r2_nhwc_kyxc_nhwk_1x1( + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const 
TensorDescriptor& /* wei_k_y_x_c_grid_desc */, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const ConvStrides& conv_strides, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto K1 = GemmK1; + const auto K0 = K / K1; + + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..bb1dc239 --- /dev/null +++ b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw.hpp @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
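+
+// Note on the K split used below: GemmKTotal = N * Ho * Wo is right-padded to GemmKPad and then
+// unmerged into (GemmKBatch, GemmK0, GemmK1) with GemmK0 = GemmKPad / (GemmKBatch * GemmK1), so
+// GemmKPad is expected to be a multiple of GemmKBatch * GemmK1; the "atomic" in the file name
+// presumably refers to accumulating the GemmKBatch partial results into the weight gradient
+// with atomic adds.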
+ +#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_ATOMIC_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmK = N * Ho * Wo +// GemmN = C * Y * X +template +__host__ __device__ constexpr auto +transform_backward_weight_convolution_into_gemm_v4r4r2_atomic_nchw_kcyx_nkhw_pad( + const TensorDescriptor& wei_k_c_y_x_grid_desc, + const TensorDescriptor& in_n_c_hi_wi_grid_desc, + const TensorDescriptor& out_n_k_ho_wo_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number, + GemmKBatchType GemmKBatch, + GemmKPadType GemmKPad) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2); + const auto X = wei_k_c_y_x_grid_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = C * Y * X; + const auto GemmKTotal = N * Ho * Wo; + const index_t GemmK0 = GemmKPad / (GemmKBatch * GemmK1); + + // A: output tensor + const auto out_gemmktotal_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor( + in_n_c_hi_wi_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto 
in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor( + in_n_c_hip_wip_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmktotal_gemmn_grid_desc = + transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..ca530934 --- /dev/null +++ b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
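+
+// Unlike the *_atomic variant above, this transform uses a single K split with no
+// GemmKBatch / GemmKPad parameters: GemmK = N * Ho * Wo and GemmK0 = GemmK / GemmK1.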
+ +#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmK = N * Ho * Wo +// GemmN = C * Y * X +template +__host__ __device__ constexpr auto +transform_backward_weight_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( + const TensorDescriptor& wei_k_c_y_x_grid_desc, + const TensorDescriptor& in_n_c_hi_wi_grid_desc, + const TensorDescriptor& out_n_k_ho_wo_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2); + const auto X = wei_k_c_y_x_grid_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = C * Y * X; + const auto GemmK = N * Ho * Wo; + const auto GemmK0 = GemmK / GemmK1; + + // weight tensor + const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // input tensor + const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor( + in_n_c_hi_wi_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor( + in_n_c_hip_wip_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmk_gemmn_grid_desc = + transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<1>{}, 
Sequence<0>{})); + + const auto in_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(out_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..e960f90c --- /dev/null +++ b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_ATOMIC_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_ATOMIC_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// A: in +// B: wei +// C: out +// GemmM = N * Ho * Wo +// GemmN = K +// GemmK = Y * X * C +template +__host__ __device__ constexpr auto +transform_backward_weight_convolution_into_gemm_v4r4r4_atomic_nhwc_kyxc_nhwk_pad( + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number, + GemmKBatchType GemmKBatch, + GemmKPadType GemmKPad) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = 
in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = Y * X * C; + const auto GemmN = K; + const auto GemmKTotal = N * Ho * Wo; + const index_t GemmK0 = GemmKPad / (GemmKBatch * GemmK1); + + // A: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmktotal_gemmm_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto in_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: output tensor + const auto out_gemmktotal_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple(in_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + out_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp 
b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..052bab42 --- /dev/null +++ b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// A: in +// B: wei +// C: out +// GemmM = N * Ho * Wo +// GemmN = K +// GemmK = Y * X * C +template +__host__ __device__ constexpr auto +transform_backward_weight_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk_pad( + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = Y * X * C; + const auto GemmN = K; + const auto GemmK = N * Ho * Wo; + const auto GemmK0 = GemmK / GemmK1; + + // A: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmm_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + 
make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: output tensor + const auto out_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(out_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + out_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..c301a9e0 --- /dev/null +++ b/include/ck/problem_transform/transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
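// Illustrative sketch (not part of the original header): the split-K transform defined in this
// file expects GemmKPad = GemmKBatch * GemmK0 * GemmK1 with GemmKPad >= GemmKTotal = N * Ho * Wo,
// since it computes GemmK0 = GemmKPad / (GemmKBatch * GemmK1). The helper below, with the
// hypothetical name compute_gemm_k_pad, shows one way a caller could derive such a GemmKPad by
// rounding GemmKTotal up to a multiple of GemmKBatch * GemmK1.
inline constexpr long compute_gemm_k_pad(long gemm_k_total, long gemm_k_batch, long gemm_k1)
{
    const long unit = gemm_k_batch * gemm_k1;       // granularity of the K split
    return (gemm_k_total + unit - 1) / unit * unit; // round up so GemmK0 is an exact integer
}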
+ +#ifndef CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R5_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_BACKWARD_WEIGHT_CONVOLUTION_INTO_GEMM_V4R4R5_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// A: out +// B: in +// C: wei +// GemmM = K +// GemmN = Y * X * C +// GemmKTotal = N * Ho * Wo +template +__host__ __device__ constexpr auto +transform_backward_weight_convolution_into_gemm_v4r4r5_nhwc_kyxc_nhwk_pad( + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number, + GemmKBatchType GemmKBatch, + GemmKPadType GemmKPad) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = Y * X * C; + const auto GemmKTotal = N * Ho * Wo; + const index_t GemmK0 = GemmKPad / (GemmKBatch * GemmK1); + + // A: output tensor + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, 
ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmktotal_gemmn_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp new file mode 100644 index 00000000..41267536 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
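// Illustrative sketch (not part of the original header): the 3D transform below reads
// Do/Ho/Wo directly from the output descriptor; those lengths are expected to satisfy the
// usual convolution relation per spatial dimension. The hypothetical helper conv_out_size
// makes that relation explicit.
inline constexpr long conv_out_size(
    long in_size, long left_pad, long right_pad, long filter_size, long dilation, long stride)
{
    const long effective_filter = (filter_size - 1) * dilation + 1;          // dilated window
    return (in_size + left_pad + right_pad - effective_filter) / stride + 1; // e.g. Ho from Hi
}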
+ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION3D_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION3D_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// A: in +// B: wei +// C: out +// GemmM = N * Do * Ho * Wo +// GemmN = K +// GemmK = Z * Y * X * C +template +__host__ __device__ constexpr auto +transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad( + const TensorDescriptor& in_grid_desc_n_di_hi_wi_c, + const TensorDescriptor& wei_k_z_y_x_c_grid_desc, + const TensorDescriptor& out_n_do_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + constexpr auto I4 = Number<4>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_grid_desc_n_di_hi_wi_c.GetLength(I0); + const auto K = out_n_do_ho_wo_k_grid_desc.GetLength(I4); + const auto C = in_grid_desc_n_di_hi_wi_c.GetLength(I4); + + const auto Di = in_grid_desc_n_di_hi_wi_c.GetLength(I1); + const auto Hi = in_grid_desc_n_di_hi_wi_c.GetLength(I2); + const auto Wi = in_grid_desc_n_di_hi_wi_c.GetLength(I3); + + const auto Do = out_n_do_ho_wo_k_grid_desc.GetLength(I1); + const auto Ho = out_n_do_ho_wo_k_grid_desc.GetLength(I2); + const auto Wo = out_n_do_ho_wo_k_grid_desc.GetLength(I3); + + const auto Z = wei_k_z_y_x_c_grid_desc.GetLength(I1); + const auto Y = wei_k_z_y_x_c_grid_desc.GetLength(I2); + const auto X = wei_k_z_y_x_c_grid_desc.GetLength(I3); + + const auto ConvStrideD = conv_strides[I0]; + const auto ConvStrideH = conv_strides[I1]; + const auto ConvStrideW = conv_strides[I2]; + + const auto ConvDilationD = conv_dilations[I0]; + const auto ConvDilationH = conv_dilations[I1]; + const auto ConvDilationW = conv_dilations[I2]; + + const auto InLeftPadD = in_left_pads[I0]; + const auto InLeftPadH = in_left_pads[I1]; + const auto InLeftPadW = in_left_pads[I2]; + + const auto InRightPadD = in_right_pads[I0]; + const auto InRightPadH = in_right_pads[I1]; + const auto InRightPadW = in_right_pads[I2]; + + const auto GemmM = N * Do * Ho * Wo; + const auto GemmN = K; + const auto GemmK = Z * Y * X * C; + const auto GemmK0 = GemmK / GemmK1; + + // A: input tensor + const auto in_grid_desc_n_dip_hip_wip_c = transform_tensor_descriptor( + in_grid_desc_n_di_hi_wi_c, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Di, InLeftPadD, InRightPadD), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_grid_desc_n_z_do_y_ho_x_wo_c = transform_tensor_descriptor( + in_grid_desc_n_dip_hip_wip_c, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1, 
2>{}, Sequence<3, 4>{}, Sequence<5, 6>{}, Sequence<7>{})); + + const auto in_grid_desc_gemmk_gemmm = + transform_tensor_descriptor(in_grid_desc_n_z_do_y_ho_x_wo_c, + make_tuple(make_merge_transform(make_tuple(Z, Y, X, C)), + make_merge_transform(make_tuple(N, Do, Ho, Wo))), + make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_grid_desc_gemmk0_gemmm_gemmk1 = + transform_tensor_descriptor(in_grid_desc_gemmk_gemmm, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_grid_desc_gemmk_gemmn = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Z * Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Z * Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_grid_desc_gemmk0_gemmn_gemmk1 = + transform_tensor_descriptor(wei_grid_desc_gemmk_gemmn, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_grid_desc_gemmm_gemmn = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Do * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // const auto out_grid_desc_gemmm_gemmn = transform_tensor_descriptor( + // out_n_do_ho_wo_k_grid_desc, + // make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), + // make_pass_through_transform(K)), + // make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<3>{}), + // make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_grid_desc_gemmk0_gemmm_gemmk1, + wei_grid_desc_gemmk0_gemmn_gemmk1, + out_grid_desc_gemmm_gemmn); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..381f9ac9 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
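// Illustrative sketch (not part of the original header): for the NCHW/KCYX/NKHW forward
// transforms in this file the implicit-GEMM problem size is GemmM = K, GemmN = N * Ho * Wo,
// GemmK = C * Y * X, matching the comment at the top of the file. The struct and helper below
// are hypothetical and only restate that mapping.
struct GemmSizesSketch
{
    long m, n, k;
};

inline constexpr GemmSizesSketch nchw_forward_conv_gemm_sizes_sketch(
    long n, long k, long c, long y, long x, long ho, long wo)
{
    return GemmSizesSketch{k, n * ho * wo, c * y * x}; // GemmM, GemmN, GemmK
}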
+ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_pad( + const TensorDescriptor& wei_k_c_y_x_global_desc, + const TensorDescriptor& in_n_c_hi_wi_global_desc, + const TensorDescriptor& out_n_k_ho_wo_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_global_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_global_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_global_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + // weight tensor + const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hip_wip_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmk_gemmn_global_desc = + transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), 
+ make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple( + wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc); +} + +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_no_pad( + const TensorDescriptor& wei_k_c_y_x_global_desc, + const TensorDescriptor& in_n_c_hi_wi_global_desc, + const TensorDescriptor& out_n_k_ho_wo_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_global_desc.GetLength(I1); + + const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + assert(InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && InRightPadW == 0); + + // weight tensor + const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const auto in_gemmk_gemmn_global_desc = + transform_tensor_descriptor(in_n_c_y_ho_x_wo_global_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple( + wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc); +} + +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nchw_kcyx_nkhw_1x1( + const TensorDescriptor& wei_k_c_y_x_global_desc, + const TensorDescriptor& 
in_n_c_hi_wi_global_desc, + const TensorDescriptor& out_n_k_ho_wo_global_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_global_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_global_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_global_desc.GetLength(I1); + + const auto Ho = out_n_k_ho_wo_global_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_global_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_global_desc.GetLength(I2); + const auto X = wei_k_c_y_x_global_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 && + ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && + InRightPadW == 0); + + // weight tensor + const auto wei_gemmk_gemmm_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor( + in_n_c_hi_wi_global_desc, + make_tuple(make_pass_through_transform(C), make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple( + wei_gemmk_gemmm_global_desc, in_gemmk_gemmn_global_desc, out_gemmm_gemmn_global_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..ebfaabb0 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
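// Illustrative sketch (not part of the original header): the *_1x1 variant in this file asserts
// a 1x1 filter with unit stride and dilation and zero padding before taking the simplified
// descriptor path. The hypothetical predicate below mirrors that precondition, so a caller
// could use it to dispatch between the padded and 1x1 transforms.
inline constexpr bool is_1x1_unit_stride_no_pad_sketch(long y, long x,
                                                       long stride_h, long stride_w,
                                                       long dilation_h, long dilation_w,
                                                       long left_pad_h, long left_pad_w,
                                                       long right_pad_h, long right_pad_w)
{
    return y == 1 && x == 1 && stride_h == 1 && stride_w == 1 && dilation_h == 1 &&
           dilation_w == 1 && left_pad_h == 0 && left_pad_w == 0 && right_pad_h == 0 &&
           right_pad_w == 0;
}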
+ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_pad( + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmn_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + 
make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple( + wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc); +} + +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4_nhwc_kyxc_nhwk_1x1( + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + assert(Y == 1 && X == 1 && ConvStrideH == 1 && ConvStrideW == 1 && ConvDilationH == 1 && + ConvDilationW == 1 && InLeftPadH == 0 && InLeftPadW == 0 && InRightPadH == 0 && + InRightPadW == 0); + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // input tensor + const auto in_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple( + wei_gemmk_gemmm_grid_desc, in_gemmk_gemmn_grid_desc, out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..6e576d69 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw.hpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
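// Illustrative sketch (not part of the original header): this file splits the reduction
// dimension as GemmK = GemmK0 * GemmK1 via an unmerge transform, so a flat reduction index
// decomposes as gemm_k = k0 * GemmK1 + k1, and GemmK1 is expected to divide GemmK = C * Y * X
// exactly because GemmK0 is computed with integer division. The struct and helper below are
// hypothetical and only spell out that decomposition.
struct K0K1Sketch
{
    long k0, k1;
};

inline constexpr K0K1Sketch split_gemm_k_sketch(long gemm_k, long gemm_k1)
{
    return K0K1Sketch{gemm_k / gemm_k1, gemm_k % gemm_k1}; // k0 slow, k1 fast
}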
+ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4r2_nchw_kcyx_nkhw_pad( + const TensorDescriptor& wei_k_c_y_x_grid_desc, + const TensorDescriptor& in_n_c_hi_wi_grid_desc, + const TensorDescriptor& out_n_k_ho_wo_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2); + const auto X = wei_k_c_y_x_grid_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = N * Ho * Wo; + const auto GemmK = C * Y * X; + const auto GemmK0 = GemmK / GemmK1; + + // weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(C * Y * X)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // input tensor + const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor( + in_n_c_hi_wi_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_c_y_ho_x_wo_grid_desc = transform_tensor_descriptor( + in_n_c_hip_wip_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); + + const 
auto in_gemmk_gemmn_grid_desc = + transform_tensor_descriptor(in_n_c_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C, Y, X)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)), + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(N, Ho * Wo))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..13e1bf25 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R2_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM = K +// GemmN = N * Ho * Wo +// GemmK = C * Y * X +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_gemm_v4r4r2_nhwc_kyxc_nhwk_pad( + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = K; + const auto GemmN = N * Ho * Wo; + const auto GemmK = C * Y * X; + const auto GemmK0 = GemmK / GemmK1; + + // 
weight tensor + const auto wei_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(wei_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmn_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // output tensor + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + return make_tuple(wei_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..088d14b2 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
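// Illustrative sketch (not part of the original header): in the transform below the input is
// GEMM matrix A, and composing its pad, embed and merge transforms amounts to the usual NHWC
// im2col index math. The hypothetical helper sketches that mapping for a packed (N, Hi, Wi, C)
// input; it returns -1 where the coordinate lands in the padding region, which the real
// transform treats as an invalid element instead.
inline long nhwc_im2col_offset_sketch(long n, long ho, long wo, long y, long x, long c,
                                      long hi_len, long wi_len, long c_len,
                                      long stride_h, long stride_w,
                                      long dilation_h, long dilation_w,
                                      long left_pad_h, long left_pad_w)
{
    const long hi = ho * stride_h + y * dilation_h - left_pad_h; // embed, then remove H padding
    const long wi = wo * stride_w + x * dilation_w - left_pad_w; // embed, then remove W padding
    if(hi < 0 || hi >= hi_len || wi < 0 || wi >= wi_len)
        return -1;                                               // inside the padded border
    return ((n * hi_len + hi) * wi_len + wi) * c_len + c;        // packed NHWC element offset
}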
+ +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_GEMM_V4R4R4_NHWC_KYXC_NHWK_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// A: in +// B: wei +// C: out +// GemmM = N * Ho * Wo +// GemmN = K +// GemmK = Y * X * C +template +__host__ __device__ constexpr auto transform_forward_convolution_into_gemm_v4r4r4_nhwc_kyxc_nhwk( + const TensorDescriptor& in_n_hi_wi_c_grid_desc, + const TensorDescriptor& wei_k_y_x_c_grid_desc, + const TensorDescriptor& out_n_ho_wo_k_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + Number) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto GemmK1 = Number{}; + + const auto N = in_n_hi_wi_c_grid_desc.GetLength(I0); + const auto C = in_n_hi_wi_c_grid_desc.GetLength(I3); + const auto K = out_n_ho_wo_k_grid_desc.GetLength(I3); + + const auto Hi = in_n_hi_wi_c_grid_desc.GetLength(I1); + const auto Wi = in_n_hi_wi_c_grid_desc.GetLength(I2); + + const auto Ho = out_n_ho_wo_k_grid_desc.GetLength(I1); + const auto Wo = out_n_ho_wo_k_grid_desc.GetLength(I2); + + const auto Y = wei_k_y_x_c_grid_desc.GetLength(I1); + const auto X = wei_k_y_x_c_grid_desc.GetLength(I2); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto GemmM = N * Ho * Wo; + const auto GemmN = K; + const auto GemmK = Y * X * C; + const auto GemmK0 = GemmK / GemmK1; + + // A: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmm_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: 
weight tensor + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)), + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp new file mode 100644 index 00000000..a6785d56 --- /dev/null +++ b/include/ck/problem_transform/transform_forward_convolution_into_gemm_v6r1_nchw_kcyx_nkhw.hpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP +#define CK_TRANSFORM_FORWARD_CONVOLUTION_INTO_CONTRACTION_V6R1_NCHW_KCYX_NKHW_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" + +namespace ck { + +// GemmM0 = 1 +// GemmM1 = K +// GemmN0 = N0 +// GemmN1 = (N / N0) * Ho * Wo +// GemmK0 = (C / C0) * Y * X +// GemmK1 = C0 +template +__host__ __device__ constexpr auto +transform_forward_convolution_into_contraction_v6r1_nchw_kcyx_nkhw_pad( + const TensorDescriptor& wei_k_c_y_x_grid_desc, + const TensorDescriptor& in_n_c_hi_wi_grid_desc, + const TensorDescriptor& out_n_k_ho_wo_grid_desc, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads& in_right_pads, + const N0Type& N0, + const C0Type& C0) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto N = in_n_c_hi_wi_grid_desc.GetLength(I0); + const auto C = in_n_c_hi_wi_grid_desc.GetLength(I1); + const auto K = out_n_k_ho_wo_grid_desc.GetLength(I1); + + const auto Hi = in_n_c_hi_wi_grid_desc.GetLength(I2); + const auto Wi = in_n_c_hi_wi_grid_desc.GetLength(I3); + + const auto Ho = out_n_k_ho_wo_grid_desc.GetLength(I2); + const auto Wo = out_n_k_ho_wo_grid_desc.GetLength(I3); + + const auto Y = wei_k_c_y_x_grid_desc.GetLength(I2); + const auto X = wei_k_c_y_x_grid_desc.GetLength(I3); + + const auto ConvStrideH = conv_strides[I0]; + const auto ConvStrideW = conv_strides[I1]; + + const auto ConvDilationH = conv_dilations[I0]; + const auto ConvDilationW = conv_dilations[I1]; + + const auto InLeftPadH = in_left_pads[I0]; + const auto InLeftPadW = in_left_pads[I1]; + + const auto InRightPadH = in_right_pads[I0]; + const auto InRightPadW = in_right_pads[I1]; + + const auto N1 = N / N0; + const auto C1 
= C / C0; + + // weight tensor + const auto wei_gk0_gm0_gm1_gk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C * Y * X)), + make_tuple(make_unmerge_transform(make_tuple(I1, K)), + make_unmerge_transform(make_tuple(C0, C1 * Y * X))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1, 2>{}, Sequence<3, 0>{})); + + // input tensor + const auto in_n_c_hip_wip_grid_desc = transform_tensor_descriptor( + in_n_c_hi_wi_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pass_through_transform(C), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n0_n1_c0_c1_y_ho_x_wo_grid_desc = transform_tensor_descriptor( + in_n_c_hip_wip_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1)), + make_unmerge_transform(make_tuple(C0, C1)), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}, Sequence<6, 7>{})); + + const auto in_gk0_gn0_gn1_gk1_grid_desc = transform_tensor_descriptor( + in_n0_n1_c0_c1_y_ho_x_wo_grid_desc, + make_tuple(make_merge_transform(make_tuple(C1, Y, X)), + make_pass_through_transform(N0), + make_merge_transform(make_tuple(N1, Ho, Wo)), + make_pass_through_transform(C0)), + make_tuple(Sequence<3, 4, 6>{}, Sequence<0>{}, Sequence<1, 5, 7>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // output tensor + const auto out_n_k_howo_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho * Wo)); + + const auto out_n0_n1_1_k_howo_grid_desc = + transform_tensor_descriptor(out_n_k_howo_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1)), + make_unmerge_transform(make_tuple(I1, K)), + make_pass_through_transform(Ho * Wo)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + const auto out_gm0_gm1_gn0_gn1_grid_desc = transform_tensor_descriptor( + out_n0_n1_1_k_howo_grid_desc, + make_tuple(make_pass_through_transform(I1), + make_pass_through_transform(K), + make_pass_through_transform(N0), + make_merge_transform_v2_magic_division(make_tuple(N1, Ho * Wo))), + make_tuple(Sequence<2>{}, Sequence<3>{}, Sequence<0>{}, Sequence<1, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + return make_tuple( + wei_gk0_gm0_gm1_gk1_grid_desc, in_gk0_gn0_gn1_gk1_grid_desc, out_gm0_gm1_gn0_gn1_grid_desc); +} + +} // namespace ck +#endif diff --git a/include/ck/stream_config.hpp b/include/ck/stream_config.hpp new file mode 100644 index 00000000..70ca3455 --- /dev/null +++ b/include/ck/stream_config.hpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
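// Illustrative usage sketch (not part of the original header): StreamConfig below is a plain
// aggregate, so caller code would typically brace-initialize it with a HIP stream it owns.
// The snippet is kept in a comment because the struct is only defined after this point;
// hipStreamCreate is the standard HIP runtime call, error handling is omitted.
//
//   hipStream_t stream = nullptr;
//   (void)hipStreamCreate(&stream);
//   StreamConfig cfg{stream, /*time_kernel_=*/true}; // log_level_ is value-initialized to 0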
+ +#pragma once + +#include +#include + +struct StreamConfig +{ + hipStream_t stream_id_ = nullptr; + bool time_kernel_ = false; + int log_level_ = 0; +}; diff --git a/include/ck/tensor/static_tensor.hpp b/include/ck/tensor/static_tensor.hpp new file mode 100644 index 00000000..fee679f9 --- /dev/null +++ b/include/ck/tensor/static_tensor.hpp @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_STATIC_TENSOR_HPP +#define CK_STATIC_TENSOR_HPP + +namespace ck { + +// StaticTensor for Scalar +template ::type = false> +struct StaticTensor +{ + static constexpr auto desc_ = TensorDesc{}; + static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension(); + static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize(); + + __host__ __device__ constexpr StaticTensor() : invalid_element_scalar_value_{0} {} + + __host__ __device__ constexpr StaticTensor(T invalid_element_value) + : invalid_element_scalar_value_{invalid_element_value} + { + } + + // read access + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr const T& operator[](Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_[Number{}]; + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return zero_scalar_value_; + } + else + { + return invalid_element_scalar_value_; + } + } + } + + // write access + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr T& operator()(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_(Number{}); + } + else + { + return ignored_element_scalar_; + } + } + + StaticBuffer data_; + static constexpr T zero_scalar_value_ = T{0}; + const T invalid_element_scalar_value_; + T ignored_element_scalar_; +}; + +// StaticTensor for vector +template ::type = false> +struct StaticTensorTupleOfVectorBuffer +{ + static constexpr auto desc_ = TensorDesc{}; + static constexpr index_t ndim_ = TensorDesc::GetNumOfDimension(); + static constexpr index_t element_space_size_ = desc_.GetElementSpaceSize(); + + static constexpr index_t num_of_vector_ = + math::integer_divide_ceil(element_space_size_, ScalarPerVector); + + using V = vector_type; + + __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer() + : invalid_element_scalar_value_{0} + { + } + + __host__ __device__ constexpr StaticTensorTupleOfVectorBuffer(S invalid_element_value) + : invalid_element_scalar_value_{invalid_element_value} + { + } + + // Get S + // Idx is for S, not V + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr const S& operator[](Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_[Number{}]; + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return zero_scalar_value_; + } + else + { + return invalid_element_scalar_value_; + } + } + } + + // Set S + // Idx 
is for S, not V + template ::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr S& operator()(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_(Number{}); + } + else + { + return ignored_element_scalar_; + } + } + + // Get X + // Idx is for S, not X. Idx should be aligned with X + template ::value && + is_known_at_compile_time::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr X GetAsType(Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + return data_.template GetAsType(Number{}); + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + // TODO: is this right way to initialize a vector? + return X{0}; + } + else + { + // TODO: is this right way to initialize a vector? + return X{invalid_element_scalar_value_}; + } + } + } + + // Set X + // Idx is for S, not X. Idx should be aligned with X + template ::value && + is_known_at_compile_time::value && Idx::Size() == ndim_, + bool>::type = false> + __host__ __device__ constexpr void SetAsType(Idx, X x) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + constexpr bool is_valid = coordinate_has_valid_offset(desc_, coord); + + if constexpr(is_valid) + { + data_.template SetAsType(Number{}, x); + } + } + + // Get read access to V. No is_valid check + // Idx is for S, not V. Idx should be aligned with V + template + __host__ __device__ constexpr const V& GetVectorTypeReference(Idx) const + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + return data_.GetVectorTypeReference(Number{}); + } + + // Get read access to V. No is_valid check + // Idx is for S, not V. Idx should be aligned with V + template + __host__ __device__ constexpr V& GetVectorTypeReference(Idx) + { + constexpr auto coord = make_tensor_coordinate(desc_, to_multi_index(Idx{})); + + constexpr index_t offset = coord.GetOffset(); + + return data_.GetVectorTypeReference(Number{}); + } + + StaticBufferTupleOfVector data_; + static constexpr S zero_scalar_value_ = S{0}; + const S invalid_element_scalar_value_ = S{0}; + S ignored_element_scalar_; +}; + +template ::type = false> +__host__ __device__ constexpr auto make_static_tensor(TensorDesc) +{ + return StaticTensor{}; +} + +template < + AddressSpaceEnum AddressSpace, + typename T, + typename TensorDesc, + typename X, + typename enable_if::type = false, + typename enable_if, remove_cvref_t>::value, bool>::type = false> +__host__ __device__ constexpr auto make_static_tensor(TensorDesc, X invalid_element_value) +{ + return StaticTensor{invalid_element_value}; +} + +} // namespace ck +#endif diff --git a/include/ck/tensor_description/cluster_descriptor.hpp b/include/ck/tensor_description/cluster_descriptor.hpp new file mode 100644 index 00000000..0c9ea2ff --- /dev/null +++ b/include/ck/tensor_description/cluster_descriptor.hpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
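For reference, a StaticTensor is normally created from a compile-time descriptor through make_static_tensor and addressed with compile-time indices. The sketch below is a usage illustration only, not part of the patch: it assumes the CK headers added here are on the include path, that make_naive_tensor_descriptor_packed is available via ck/tensor_description/tensor_descriptor_helper.hpp, and that AddressSpaceEnum::Vgpr names the register address space.

// Minimal device-side sketch (assumed includes and address-space enumerator, see note above).
#include "ck/utility/common_header.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp" // assumed path for the packed-descriptor helper
#include "ck/tensor/static_tensor.hpp"

using namespace ck;

__device__ float sum_2x2_register_tile()
{
    // 2x2 register tile described by a fully compile-time packed descriptor
    constexpr auto desc =
        make_naive_tensor_descriptor_packed(make_tuple(Number<2>{}, Number<2>{}));

    auto t = make_static_tensor<AddressSpaceEnum::Vgpr, float>(desc);

    // write through operator(), read through operator[]; indices are compile-time tuples
    static_for<0, 2, 1>{}([&](auto i) {
        static_for<0, 2, 1>{}([&](auto j) { t(make_tuple(i, j)) = 1.0f; });
    });

    float s = 0;
    static_for<0, 2, 1>{}([&](auto i) {
        static_for<0, 2, 1>{}([&](auto j) { s += t[make_tuple(i, j)]; });
    });
    return s;
}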
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +template ::type> +__host__ __device__ constexpr auto make_cluster_descriptor( + const Lengths& lengths, + ArrangeOrder order = typename arithmetic_sequence_gen<0, Lengths::Size(), 1>::type{}) +{ + constexpr index_t ndim_low = Lengths::Size(); + + const auto reordered_lengths = container_reorder_given_new2old(lengths, order); + + const auto low_lengths = generate_tuple( + [&](auto idim_low) { return reordered_lengths[idim_low]; }, Number{}); + + const auto transform = make_merge_transform(low_lengths); + + constexpr auto low_dim_old_top_ids = ArrangeOrder{}; + + constexpr auto up_dim_new_top_ids = Sequence<0>{}; + + return make_single_stage_tensor_adaptor( + make_tuple(transform), make_tuple(low_dim_old_top_ids), make_tuple(up_dim_new_top_ids)); +} + +} // namespace ck diff --git a/include/ck/tensor_description/multi_index_transform.hpp b/include/ck/tensor_description/multi_index_transform.hpp new file mode 100644 index 00000000..4e4d7593 --- /dev/null +++ b/include/ck/tensor_description/multi_index_transform.hpp @@ -0,0 +1,1954 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/multi_index.hpp" + +namespace ck { + +template +struct PassThrough +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{})); + + UpLengths up_lengths_; + + __host__ __device__ constexpr PassThrough() = default; + + __host__ __device__ constexpr PassThrough(const LowLength& low_length) + : up_lengths_{make_tuple(low_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}]; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("PassThrough, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct Pad +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{} + LeftPadLength{} + RightPadLength{})); + + UpLengths up_lengths_; + LeftPadLength left_pad_length_; + RightPadLength right_pad_length_; + + __host__ __device__ constexpr Pad() = default; + + __host__ __device__ constexpr Pad(const LowLength& low_length, + const LeftPadLength& left_pad_length, + const RightPadLength& right_pad_length) + : up_lengths_{make_tuple(low_length + left_pad_length + right_pad_length)}, + left_pad_length_{left_pad_length}, + right_pad_length_{right_pad_length} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_length_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return SkipIsValidCheck; + } + + template + __host__ __device__ constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const + { + return SkipIsValidCheck || + ((idx_up[Number<0>{}] >= left_pad_length_) && + (idx_up[Number<0>{}] < up_lengths_[Number<0>{}] - right_pad_length_)); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Pad, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("left_pad_length %d", index_t{left_pad_length_}); + printf("right_pad_length %d", index_t{right_pad_length_}); + printf("}"); + } +}; + +template +struct LeftPad +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{} + LeftPadLength{})); + + UpLengths up_lengths_; + LeftPadLength left_pad_length_; + + __host__ __device__ constexpr LeftPad() = default; + + __host__ __device__ constexpr LeftPad(const LowLength& low_length, + const LeftPadLength& left_pad_length) + : up_lengths_{make_tuple(low_length + left_pad_length)}, left_pad_length_{left_pad_length} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] - left_pad_length_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return SkipIsValidCheck; + } + + template + __host__ __device__ constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const + { + return SkipIsValidCheck || (idx_up[Number<0>{}] >= left_pad_length_); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("LeftPad, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("left_pad_length_ %d", index_t{left_pad_length_}); + printf("}"); + } +}; + +template +struct RightPad +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(LowLength{} + RightPadLength{})); + + UpLengths up_lengths_; + LowLength low_length_; + RightPadLength right_pad_length_; + + __host__ __device__ constexpr RightPad() = default; + + __host__ __device__ constexpr RightPad(const LowLength& low_length, + const RightPadLength& right_pad_length) + : up_lengths_{make_tuple(low_length + right_pad_length)}, + low_length_{low_length}, + right_pad_length_{right_pad_length} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ static constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}]; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return SkipIsValidCheck; + } + + template + __host__ __device__ constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& idx_up) const + { + return SkipIsValidCheck || (idx_up[Number<0>{}] < low_length_); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("RightPad, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("low_length_ %d", index_t{low_length_}); + printf("left_pad_length_ %d", index_t{right_pad_length_}); + printf("}"); + } +}; + +// idx_low = coefficients[0, ...nDimUp-1] * idx_up[0, ...nDimUp-1] +// UpLengths and Coefficients can be either of the followings: +// 1) Tuple of index_t, which is known at run-time, or +// 2) Tuple of Number, which is known at compile-time, or +// 3) Tuple of mixture of index_t and Number, which is known partially at run-time and partially +// at compile-time +template ::type = false> +struct Embed +{ + static constexpr index_t NDimUp = UpLengths::Size(); + + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex; + + UpLengths up_lengths_; + Coefficients coefficients_; + + __host__ __device__ constexpr Embed() = default; + + __host__ __device__ constexpr Embed(const UpLengths& up_lengths, + const Coefficients& coefficients) + : up_lengths_{up_lengths}, coefficients_{coefficients} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == NDimUp, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = 0; + + static_for<0, NDimUp, 1>{}([&idx_low, &idx_up, this](auto i) { + idx_low(Number<0>{}) += idx_up[i] * this->coefficients_[i]; + }); + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) const + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == NDimUp && + LowIdx::Size() == 1 && UpIdx::Size() == NDimUp, + "wrong! 
inconsistent # of dimension"); + + idx_diff_low(Number<0>{}) = 0; + + static_for<0, NDimUp, 1>{}( + [&](auto i) { idx_diff_low(Number<0>{}) += idx_diff_up[i] * coefficients_[i]; }); + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Embed, "); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("coefficients_ "); + print_multi_index(coefficients_); + printf("}"); + } +}; + +// Implementation of "Merge" transformation primitive that uses regular to do lowering of +// multi-index and use carry-and-borrow check to do lowering of multi-index delta +template +struct Merge_v1_carry_check +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using LowLengthsScan = + decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + + LowLengths low_lengths_; + LowLengthsScan low_lengths_scan_; + UpLengths up_lengths_; + + __host__ __device__ constexpr Merge_v1_carry_check() = default; + + __host__ __device__ constexpr Merge_v1_carry_check(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_scan_{ + container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + // normal division + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_low(i) = tmp / this->low_lengths_scan_[i]; + tmp -= idx_low[i] * this->low_lengths_scan_[i]; + }); + + idx_low(Number{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& /* idx_up_new */, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. + // However, + // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const + // can be calculated at compile-time. 
+ // 2) If idx_diff_up is not known at compile-time, but its value + // doesn't change during the whole kernel execution, then + // idx_diff_low_const also + // doesn't change during the whole kernel execution. Compiler generated + // ISA should + // only caclculate idx_diff_low_const once and save it durinng the whole + // kernel execution + // If neither 1) nor 2) is satisfied, then the calculation will also be + // computed at + // run-time each time this function is called, and can be very expensive. + LowerIndex idx_diff_low_const; + LowerIndex idx_low_length_minus_idx_diff_low_const; + LowerIndex idx_low_length_plus_idx_diff_low_const; + +#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = tmp / low_lengths_scan_[i]; + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = tmp; + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i]; + + idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i]; + }); +#else + // Hack: this force result into SGPR. Need to make sure the result is thread invariant + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]); + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = __builtin_amdgcn_readfirstlane(tmp); + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = + __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]); + + idx_low_length_plus_idx_diff_low_const(i) = + __builtin_amdgcn_readfirstlane(low_lengths_[i] + idx_diff_low_const[i]); + }); +#endif + + if constexpr(Hack == 1) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + else if constexpr(Hack == 2) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t borrow = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] - borrow; + + bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; + + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) -= borrow; + + borrow = do_borrow ? 1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow; + + idx_low += idx_diff_low; + } + else + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? 
-idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 1 : 0; + carry = do_borrow ? -1 : carry; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + } + + template + __host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& /* idx_up_new */, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. + // However, + // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const + // can be calculated at compile-time. + // 2) If idx_diff_up is not known at compile-time, but its value + // doesn't change during the whole kernel execution, then + // idx_diff_low_const also + // doesn't change during the whole kernel execution. Compiler generated + // ISA should + // only caclculate idx_diff_low_const once and save it durinng the whole + // kernel execution + // If neither 1) nor 2) is satisfied, then the calculation will also be + // computed at + // run-time each time this function is called, and can be very expensive. + LowerIndex idx_diff_low_const; + LowerIndex idx_low_length_minus_idx_diff_low_const; + LowerIndex idx_low_length_plus_idx_diff_low_const; + +#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = tmp / low_lengths_scan_[i]; + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = tmp; + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i]; + + idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i]; + }); +#else + // Hack: this force result into SGPR. Need to make sure the result is thread invariant + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]); + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = __builtin_amdgcn_readfirstlane(tmp); + + static_for<0, NDimLow, 1>{}([&](auto i) { + idx_low_length_minus_idx_diff_low_const(i) = + __builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]); + + idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i]; + }); +#endif + + if constexpr(Hack == 1) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 
1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + else if constexpr(Hack == 2) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t borrow = 0; + + static_for{}([&](auto i) { + index_t negative_idx_low_tmp = borrow - idx_low[i]; + + bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i]; + + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i]; + + idx_diff_low(i) -= borrow; + + borrow = do_borrow ? 1 : 0; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow; + + idx_low += idx_diff_low; + } + else + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + index_t carry = 0; + + static_for{}([&](auto i) { + index_t idx_low_tmp = idx_low[i] + carry; + + bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i]; + bool do_borrow = idx_low_tmp < -idx_diff_low_const[i]; + + idx_diff_low(i) = + do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i]; + idx_diff_low(i) = + do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i]; + + idx_diff_low(i) += carry; + + carry = do_carry ? 1 : 0; + carry = do_borrow ? -1 : carry; + }); + + idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry; + + idx_low += idx_diff_low; + } + } + + template + __host__ __device__ void UpdateLowerIndex_2(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& /* idx_up_new */, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + // CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions. + // However, + // 1) If idx_diff_up is known at compile-time, then idx_diff_low_const + // can be calculated at compile-time. + // 2) If idx_diff_up is not known at compile-time, but its value + // doesn't change during the whole kernel execution, then + // idx_diff_low_const also + // doesn't change during the whole kernel execution. Compiler generated + // ISA should + // only caclculate idx_diff_low_const once and save it durinng the whole + // kernel execution + // If neither 1) nor 2) is satisfied, then the calculation will also be + // computed at run-time each time this function is called, and can be + // very expensive. + LowerIndex idx_diff_low_const; + +#if !CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = tmp / low_lengths_scan_[i]; + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = tmp; +#else + // Hack: this force result into SGPR. 
Need to make sure the result is thread invariant + index_t tmp = idx_diff_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]); + tmp -= idx_diff_low_const[i] * low_lengths_scan_[i]; + }); + + idx_diff_low_const(Number{}) = __builtin_amdgcn_readfirstlane(tmp); +#endif + + if constexpr(Hack == 1) + { + // do carry check on each low dimension in reversed order + // do not need to check the first dimension + bool do_carry = 0; + + static_for{}([&](auto i) { + idx_diff_low(i) = idx_diff_low_const[i] + do_carry; + + index_t idx_low_tmp = idx_low[i] + idx_diff_low[i]; + + do_carry = idx_low_tmp >= low_lengths_[i]; + +#if 0 + // TODO: use exec-mask inline asm, which use 1 VALU + if(do_carry) + { + idx_diff_low(i) -= low_lengths_[i]; + } +#elif 1 + // this use 2 VALU + idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i]; +#elif 1 + // this use 2 VALU + index_t idx_diff_low_tmp = idx_diff_low[i] - low_lengths_[i]; + idx_diff_low(i) = do_carry ? idx_diff_low_tmp : idx_diff_low[i]; +#endif + + idx_low(i) += idx_diff_low[i]; + }); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_low_const[I0] + do_carry; + + idx_low(I0) += idx_diff_low[I0]; + } + else if constexpr(Hack == 2) + { + // do borrow check on each low dimension in reversed order + // do not need to check the first dimension + bool do_borrow = 0; + + static_for{}([&](auto i) { + idx_diff_low(i) = idx_diff_low_const[i] - do_borrow; + + index_t idx_low_tmp = idx_low[i] + idx_diff_low[i]; + + do_borrow = idx_low_tmp < 0; + +#if 0 + // TODO: use exec-mask inline asm + if(do_borrow) + { + idx_diff_low(i) += low_lengths_[i]; + } +#elif 1 + idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i]; +#elif 1 + index_t idx_diff_low_tmp = idx_diff_low[i] + low_lengths_[i]; + idx_diff_low(i) = do_borrow ? 
idx_diff_low_tmp : idx_diff_low[i]; +#endif + + idx_low(i) += idx_diff_low[i]; + }); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_low_const[I0] - do_borrow; + + idx_low(I0) += idx_diff_low[I0]; + } + else + { + // not implemented + } + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { +#if 1 + UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); +#elif 0 + UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); +#else + UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); +#endif + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Merge_v1_carry_check, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_scan_ "); + print_multi_index(low_lengths_scan_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct lambda_merge_generate_MagicDivision_calculate_magic_multiplier +{ + template + __host__ __device__ constexpr auto operator()(Number i) const + { + return MagicDivision::CalculateMagicMultiplier(LowLengths{}[i]); + } +}; + +template +struct lambda_merge_generate_MagicDivision_calculate_magic_shift +{ + template + __host__ __device__ constexpr auto operator()(Number i) const + { + return MagicDivision::CalculateMagicShift(LowLengths{}[i]); + } +}; + +// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering +// of both multi-index and delta of multi-index +// Caution: +// 1. The magic number division implementation being used would produce correct result if the +// dividended is uint32_t and its value is with in 31-bit value range of uint32_t. +// 2. The magic number division for int32_t dividened has not been implemented, the int32_t +// dividend would be bit-wise interpreted as uint32_t and magic number division implementation for +// uint32_t is then used. +// 3. For Merge primitive, upper-index is the dividend. +// 4. When upper-index is uint32_t, its value need to be within 31-bit range. +// 5. When upper-index is int32_t type (when index_t is int32_t), its value need to be +// non-negative. 
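The caveats above describe the usual shape of magic-number division: a divisor-dependent multiplier and shift are computed once, and every subsequent division becomes a wide multiply plus a shift. The standalone sketch below illustrates the generic round-up scheme under the same 31-bit dividend restriction; it demonstrates the technique only and is not the CK MagicDivision implementation.

// Generic magic-number division sketch: q = n / d computed as (n * m) >> (32 + s), with
// s = ceil(log2(d)) and m = ceil(2^(32+s) / d). Correct for all 0 <= n < 2^31.
// Illustration only; not the CK MagicDivision code.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct MagicDiv
{
    uint64_t multiplier; // ceil(2^shift / divisor)
    uint32_t shift;      // 32 + ceil(log2(divisor))
};

// Precompute multiplier/shift for an invariant divisor d (1 <= d <= 2^31).
MagicDiv make_magic(uint32_t d)
{
    assert(d >= 1 && d <= (uint32_t{1} << 31));
    uint32_t s = 0;
    while((uint64_t{1} << s) < d)
        ++s;                                                 // s = ceil(log2(d)), at most 31
    const uint32_t shift = 32 + s;                           // at most 63, so the shifts below are defined
    const uint64_t m = ((uint64_t{1} << shift) - 1) / d + 1; // equals ceil(2^shift / d)
    return {m, shift};
}

// q = n / d for any n < 2^31 (the 31-bit restriction noted in the comment above).
uint32_t magic_div(uint32_t n, const MagicDiv& md)
{
    return static_cast<uint32_t>((static_cast<uint64_t>(n) * md.multiplier) >> md.shift);
}

int main()
{
    for(uint32_t d : {1u, 3u, 7u, 60u, 1000u, 1920u})
        for(uint32_t n : {0u, 1u, 59u, 123456u, 0x7fffffffu})
            assert(magic_div(n, make_magic(d)) == n / d);

    std::printf("magic division matches plain division\n");
    return 0;
}

Because the multiplier and shift depend only on the low lengths, Merge_v2_magic_division below computes them once in its constructor and reuses them for every index calculation and update.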
+template +struct Merge_v2_magic_division +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + + using LowLengthsMagicDivisorMultipiler = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, + Number{})); + + using LowLengthsMagicDivisorShift = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift{}, + Number{})); + + LowLengths low_lengths_; + LowLengthsMagicDivisorMultipiler low_lengths_magic_divisor_multiplier_; + LowLengthsMagicDivisorShift low_lengths_magic_divisor_shift_; + UpLengths up_lengths_; + + __host__ __device__ constexpr Merge_v2_magic_division() = default; + + __host__ __device__ constexpr Merge_v2_magic_division(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_magic_divisor_multiplier_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths[i]); }, + Number{})}, + low_lengths_magic_divisor_shift_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths[i]); }, + Number{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + static_for{}([&, this](auto i) { + index_t tmp2 = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_magic_divisor_multiplier_[i], + this->low_lengths_magic_divisor_shift_[i]); + idx_low(i) = tmp - tmp2 * this->low_lengths_[i]; + tmp = tmp2; + }); + + idx_low(Number<0>{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff&, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + index_t tmp = idx_up_new[Number<0>{}]; + + static_for{}([&, this](auto i) { + index_t tmp2 = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_magic_divisor_multiplier_[i], + this->low_lengths_magic_divisor_shift_[i]); + + index_t idx_low_old = idx_low[i]; + + idx_low(i) = tmp - tmp2 * this->low_lengths_[i]; + tmp = tmp2; + + idx_diff_low(i) = idx_low[i] - idx_low_old; + }); + + idx_diff_low(Number<0>{}) = tmp - idx_low(Number<0>{}); + + idx_low(Number<0>{}) = tmp; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Merge_v2_magic_division, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_magic_divisor_multiplier_ "); + print_multi_index(low_lengths_magic_divisor_multiplier_); + printf("low_lengths_magic_divisor_shift_ "); + print_multi_index(low_lengths_magic_divisor_shift_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +// Implementation of "Merge" transformation primitive that uses magic-number-division to do lowering +// of both multi-index and delta of multi-index +// Caution: +// 1. The magic number division implementation being used would produce correct result if the +// dividended is uint32_t and its value is with in 31-bit value range of uint32_t. +// 2. The magic number division for int32_t dividened has not been implemented, the int32_t +// dividend would be bit-wise interpreted as uint32_t and magic number division implementation for +// uint32_t is then used. +// 3. For Merge primitive, upper-index is the dividend. +// 4. When upper-index is uint32_t, its value need to be within 31-bit range. +// 5. When upper-index is int32_t type (when index_t is int32_t), its value need to be +// non-negative. 
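Both magic-division merges compute the same lowering; they differ in what they divide by. Merge_v2_magic_division repeatedly divides the flat index by the low lengths themselves, peeling the fastest dimension first, while Merge_v2r2_magic_division below divides by the reverse-exclusive-scan of the lengths, i.e. the packed strides. The plain C++ sketch below shows the two equivalent decompositions, with ordinary division standing in for the magic division purely for readability.

// Two equivalent ways to lower a flat ("merged") index to a 3-D multi-index, mirroring
// Merge_v2_magic_division (divide by lengths, fastest dimension first) and
// Merge_v2r2_magic_division (divide by packed strides). Ordinary / and % replace the
// magic division here for clarity.
#include <array>
#include <cassert>
#include <cstdint>

int main()
{
    const std::array<uint32_t, 3> len = {4, 5, 6}; // low lengths, slowest..fastest
    const uint32_t flat = 97;                      // some upper index < 4 * 5 * 6

    // Merge_v2 style: peel dimensions from the fastest one upward
    std::array<uint32_t, 3> a{};
    {
        uint32_t tmp = flat;
        for(int i = 2; i >= 1; --i)
        {
            a[i] = tmp % len[i]; // remainder is this dimension's index
            tmp /= len[i];       // quotient carries on to the next-slower dimension
        }
        a[0] = tmp;
    }

    // Merge_v2r2 style: divide by precomputed packed strides (reverse exclusive scan)
    std::array<uint32_t, 3> b{};
    {
        const std::array<uint32_t, 3> stride = {len[1] * len[2], len[2], 1};
        uint32_t tmp = flat;
        for(int i = 0; i < 2; ++i)
        {
            b[i] = tmp / stride[i];
            tmp -= b[i] * stride[i];
        }
        b[2] = tmp;
    }

    assert(a == b); // both give (3, 1, 1) for flat = 97
    return 0;
}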
+template +struct Merge_v2r2_magic_division +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using LowLengthsScan = + decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + + using LowLengthsScanMagicDivisorMultipiler = decltype(generate_tuple( + lambda_merge_generate_MagicDivision_calculate_magic_multiplier{}, + Number{})); + + using LowLengthsScanMagicDivisorShift = decltype( + generate_tuple(lambda_merge_generate_MagicDivision_calculate_magic_shift{}, + Number{})); + + LowLengths low_lengths_; + LowLengthsScan low_lengths_scan_; + LowLengthsScanMagicDivisorMultipiler low_lengths_scan_magic_divisor_multiplier_; + LowLengthsScanMagicDivisorShift low_lengths_scan_magic_divisor_shift_; + UpLengths up_lengths_; + + __host__ __device__ constexpr Merge_v2r2_magic_division() = default; + + __host__ __device__ constexpr Merge_v2r2_magic_division(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_scan_{ + container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})}, + low_lengths_scan_magic_divisor_multiplier_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicMultiplier(low_lengths_scan_[i]); }, + Number{})}, + low_lengths_scan_magic_divisor_shift_{generate_tuple( + [&](auto i) { return MagicDivision::CalculateMagicShift(low_lengths_scan_[i]); }, + Number{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&, this](auto i) { + idx_low(i) = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_scan_magic_divisor_multiplier_[i], + this->low_lengths_scan_magic_divisor_shift_[i]); + + tmp -= idx_low[i] * this->low_lengths_scan_[i]; + }); + + idx_low(Number{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff&, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + index_t tmp = idx_up_new[Number<0>{}]; + + static_for<0, NDimLow - 1, 1>{}([&, this](auto i) { + index_t idx_low_old = idx_low[i]; + + idx_low(i) = + MagicDivision::DoMagicDivision(tmp, + this->low_lengths_scan_magic_divisor_multiplier_[i], + this->low_lengths_scan_magic_divisor_shift_[i]); + + idx_diff_low(i) = idx_low[i] - idx_low_old; + + tmp -= idx_low[i] * this->low_lengths_scan_[i]; + }); + + idx_diff_low(Number{}) = tmp - idx_low[Number{}]; + + idx_low(Number{}) = tmp; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Merge_v2r2_magic_division, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_scan "); + print_multi_index(low_lengths_scan_); + printf("low_lengths_scan_magic_divisor_multiplier_ "); + print_multi_index(low_lengths_scan_magic_divisor_multiplier_); + printf("low_lengths_scan_magic_divisor_shift_ "); + print_multi_index(low_lengths_scan_magic_divisor_shift_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +// Implementation of "Merge" transformation primitive that uses division and mod. It is supposed to +// be used for low_lengths that are known at compile time and are power of 2, otherwise performance +// will be very bad +template +struct Merge_v3_division_mod +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using LowLengthsScan = + decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + + LowLengths low_lengths_; + LowLengthsScan low_lengths_scan_; + UpLengths up_lengths_; + + __host__ __device__ constexpr Merge_v3_division_mod() = default; + + __host__ __device__ constexpr Merge_v3_division_mod(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_scan_{ + container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + // division and mod + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_low(i) = tmp / this->low_lengths_scan_[i]; + tmp %= this->low_lengths_scan_[i]; + }); + + idx_low(Number{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff&, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + constexpr auto INm1 = Number{}; + + index_t tmp = idx_up_new[I0]; + + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + const index_t tmp2 = idx_low[i]; + idx_low(i) = tmp / this->low_lengths_scan_[i]; + idx_diff_low(i) = idx_low[i] - tmp2; + tmp %= this->low_lengths_scan_[i]; + }); + + const index_t tmp2 = idx_low[INm1]; + idx_low(INm1) = tmp; + idx_diff_low(INm1) = idx_low[INm1] - tmp2; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Merge_v3_direct_division_mod, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_scan_ "); + print_multi_index(low_lengths_scan_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct UnMerge +{ + static constexpr index_t NDimUp = UpLengths::Size(); + + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex; + + using UpLengthsScan = + decltype(container_reverse_exclusive_scan(UpLengths{}, math::multiplies{}, Number<1>{})); + + UpLengths up_lengths_; + UpLengthsScan up_lengths_scan_; + + __host__ __device__ constexpr UnMerge() = default; + + __host__ __device__ constexpr UnMerge(const UpLengths& up_lengths) + : up_lengths_{up_lengths}, + up_lengths_scan_{ + container_reverse_exclusive_scan(up_lengths, math::multiplies{}, Number<1>{})} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return NDimUp; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + if constexpr(!Use24BitIntegerCalculation) + { + idx_low(Number<0>{}) = idx_up[Number{}]; + + static_for<0, NDimUp - 1, 1>{}( + [&](auto i) { idx_low(Number<0>{}) += idx_up[i] * up_lengths_scan_[i]; }); + } + else + { + idx_low(Number<0>{}) = idx_up[Number{}]; + + static_for<0, NDimUp - 1, 1>{}([&](auto i) { + idx_low(Number<0>{}) = + (0x00ffffff & idx_low[Number<0>{}]) + + (0x00ffffff & idx_up[i]) * (0x00ffffff & up_lengths_scan_[i]); + }); + } + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) const + { + 
CalculateLowerIndex(idx_diff_low, idx_diff_up); + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("UnMerge, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("up_lengths_scan_"); + print_multi_index(up_lengths_scan_); + printf("}"); + } +}; + +template +struct Freeze +{ + LowerIndex low_idx_; + + __host__ __device__ constexpr Freeze() = default; + + __host__ __device__ constexpr Freeze(const LowerIndex& low_idx) : low_idx_{low_idx} {} + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 0; } + + __host__ __device__ static constexpr auto GetUpperLengths() { return Tuple<>{}; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& /* idx_up */) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 0, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = low_idx_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& /* idx_diff_up */, + LowIdx& /* idx_low */, + const UpIdx& /* idx_up_new */, + Number) + { + idx_diff_low(Number<0>{}) = 0; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("Freeze"); + printf("low_idx_ %d", index_t{low_idx_}); + } +}; + +// Insert a dangling upper dimension without lower dimension +template +struct Insert +{ + using UpLengths = decltype(make_tuple(UpperLength{})); + + UpLengths up_lengths_; + + __host__ __device__ constexpr Insert() = default; + + __host__ __device__ constexpr Insert(const UpperLength& up_length) + : up_lengths_{make_tuple(up_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 0; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr auto GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx&, const UpIdx&) const + { + static_assert(LowIdx::Size() == 0 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + } + + template + __host__ __device__ static void + UpdateLowerIndex(LowIdxDiff&, const UpIdxDiff&, LowIdx&, const UpIdx&, Number) + { + static_assert(LowIdxDiff::Size() == 0 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 0 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("Insert"); + print_multi_index(up_lengths_); + } +}; + +template +struct Vectorize +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(UpLength{})); + + UpLengths up_lengths_; + VectorSize vector_size_; + + __host__ __device__ constexpr Vectorize() = default; + + __host__ __device__ constexpr Vectorize(const VectorSize& vector_size, + const UpLength& up_length) + : vector_size_{vector_size}, up_lengths_{make_tuple(up_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = vector_size_ * idx_up[Number<0>{}]; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) const + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = vector_size_ * idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Vectorize, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +struct Slice +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + + using UpLengths = decltype(make_tuple(SliceEnd{} - SliceBegin{})); + + UpLengths up_lengths_; + SliceBegin slice_begin_; + SliceEnd slice_end_; + + __host__ __device__ constexpr Slice() = default; + + __host__ __device__ constexpr Slice(const LowLength&, + const SliceBegin& slice_begin, + const SliceEnd& slice_end) + : up_lengths_{make_tuple(slice_end - slice_begin)}, + slice_begin_{slice_begin}, + slice_end_{slice_end} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] + slice_begin_; + } + + template + __host__ __device__ static void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx&, + Number) + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + idx_diff_low(I0) = idx_diff_up[I0]; + + idx_low += idx_diff_low; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return true; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ constexpr bool IsValidUpperIndexMappedToValidLowerIndex(const UpIdx&) const + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Slice, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("slice_begin_ %d", index_t{slice_begin_}); + printf("slice_end %d", index_t{slice_end_}); + printf("}"); + } +}; + +/* + * \brief lower_idx = upper_idx % modulus. + * TODO: Need an improved implementation since the modulo operation is expensive. 
+ */ +template +struct Modulo +{ + using LowerIndex = MultiIndex<1>; + using UpperIndex = MultiIndex<1>; + using UpLengths = decltype(make_tuple(UpLength{})); + + Modulus modulus_; + UpLengths up_lengths_; + + __host__ __device__ constexpr Modulo() = default; + + __host__ __device__ constexpr Modulo(const Modulus& modulus, const UpLength& up_length) + : modulus_{modulus}, up_lengths_{make_tuple(up_length)} + { + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return 1; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == 1 && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + idx_low(Number<0>{}) = idx_up[Number<0>{}] % modulus_; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_diff_up, + LowIdx& idx_low, + const UpIdx& up_idx, + Number) const + { + static_assert(LowIdxDiff::Size() == 1 && UpIdxDiff::Size() == 1 && LowIdx::Size() == 1 && + UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + + const auto idx_low_old = idx_low; + idx_low(I0) = (up_idx(I0) + idx_diff_up(I0)) % modulus_; + idx_diff_low(I0) = idx_low - idx_low_old; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Modulus, "); + printf("up_lengths_"); + print_multi_index(up_lengths_); + printf("}"); + } +}; +} // namespace ck diff --git a/include/ck/tensor_description/multi_index_transform_helper.hpp b/include/ck/tensor_description/multi_index_transform_helper.hpp new file mode 100644 index 00000000..044a9037 --- /dev/null +++ b/include/ck/tensor_description/multi_index_transform_helper.hpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
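+
+// This header provides make_xxx_transform() factory functions that wrap the
+// index transforms defined in multi_index_transform.hpp (PassThrough, Pad,
+// Embed, Merge, UnMerge, Freeze, Insert, Slice, Vectorize, Modulo, ...), so
+// that callers do not have to spell out the transform template types.
+//
+// Minimal usage sketch (assumes the factories declared below plus make_tuple
+// and Number<> from common_header.hpp; the lengths are arbitrary examples):
+//
+//   // keep a length-128 dimension as-is, pad a length-60 dimension on the
+//   // right by 4
+//   const auto pass_m = make_pass_through_transform(128);
+//   const auto pad_n  = make_right_pad_transform(60, Number<4>{});
+//
+//   // view an (m, n) index pair of a 128 x 64 space as one linear index
+//   const auto merge_mn = make_merge_transform(make_tuple(128, 64));
+//
+// The transform objects are consumed by transform_tensor_descriptor() and
+// make_single_stage_tensor_adaptor() together with lower/upper dimension id
+// sequences.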
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto make_pass_through_transform(const LowLength& low_length) +{ + return PassThrough{low_length}; +} + +template +__host__ __device__ constexpr auto +make_pad_transform(const LowLength& low_length, + const LeftPad& left_pad, + const RightPad& right_pad, + integral_constant = integral_constant{}) +{ + return Pad{low_length, left_pad, right_pad}; +} + +template +__host__ __device__ constexpr auto make_left_pad_transform( + const LowLength& low_length, + const LeftPadLength& left_pad, + integral_constant = integral_constant{}) +{ + return LeftPad{low_length, left_pad}; +} + +template +__host__ __device__ constexpr auto make_right_pad_transform( + const LowLength& low_length, + const RightPadLength& right_pad, + integral_constant = integral_constant{}) +{ + return RightPad{low_length, right_pad}; +} + +template ::type = false> +__host__ __device__ constexpr auto make_embed_transform(const UpLengths& up_lengths, + const Coefficients& coefficients) +{ + return Embed{up_lengths, coefficients}; +} + +template +__host__ __device__ constexpr auto make_merge_transform(const LowLengths& low_lengths) +{ +#if CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION + return make_merge_transform_v2_magic_division(low_lengths); +#else + return make_merge_transform_v1_carry_check(low_lengths); +#endif +} + +template +__host__ __device__ constexpr auto +make_merge_transform_v1_carry_check(const LowLengths& low_lengths) +{ + return Merge_v1_carry_check{low_lengths}; +} + +template +__host__ __device__ constexpr auto +make_merge_transform_v2_magic_division(const LowLengths& low_lengths) +{ +#if 1 + return Merge_v2_magic_division{low_lengths}; +#else + return Merge_v2r2_magic_division{low_lengths}; +#endif +} + +template +__host__ __device__ constexpr auto +make_merge_transform_v3_division_mod(const LowLengths& low_lengths) +{ + return Merge_v3_division_mod{low_lengths}; +} + +template +__host__ __device__ constexpr auto make_unmerge_transform( + const UpLengths& up_lengths, + integral_constant = integral_constant{}) +{ + return UnMerge{up_lengths}; +} + +template +__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex& low_idx) +{ + return Freeze{low_idx}; +} + +template +__host__ __device__ constexpr auto make_insert_transform(const UpperIndex& up_idx) +{ + return Insert{up_idx}; +} + +template +__host__ __device__ constexpr auto make_slice_transform(const LowLength& low_length, + const SliceBegin& slice_begin, + const SliceEnd& slice_end) +{ + return Slice{low_length, slice_begin, slice_end}; +} + +template +__host__ __device__ constexpr auto make_vectorize_transform(const VectorSize& vector_size, + const UpLength& up_length) +{ + return Vectorize{vector_size, up_length}; +} + +template +__host__ __device__ constexpr auto make_modulo_transform(const Modulus& modulus, + const UpLength& up_length) +{ + return Modulo{modulus, up_length}; +} +} // namespace ck diff --git a/include/ck/tensor_description/tensor_adaptor.hpp b/include/ck/tensor_description/tensor_adaptor.hpp new file mode 100644 index 00000000..d42e0a6f --- /dev/null +++ b/include/ck/tensor_description/tensor_adaptor.hpp @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
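+
+// A TensorAdaptor is a chain of index transforms mapping a "top" multi-index
+// to a "bottom" multi-index. Unlike TensorDescriptor it carries no element
+// space size, so it is used for pure index remapping (e.g. thread id ->
+// data coordinate).
+//
+// Minimal usage sketch (assumes make_merge_transform() and make_multi_index()
+// from the headers included below; the 8 x 16 shape is only an example):
+//
+//   // top: one linear id in [0, 8 * 16); bottom: (row, col) of an 8 x 16 grid
+//   constexpr auto adaptor = make_single_stage_tensor_adaptor(
+//       make_tuple(make_merge_transform(make_tuple(Number<8>{}, Number<16>{}))),
+//       make_tuple(Sequence<0, 1>{}), // lower ids: row, col
+//       make_tuple(Sequence<0>{}));   // upper id: linear id
+//
+//   const auto row_col = adaptor.CalculateBottomIndex(make_multi_index(21));
+//   // yields (row, col) == (1, 5) for this row-major 8 x 16 view
+//
+// Adaptors can be composed with chain_tensor_adaptors(adaptor0, adaptor1, ...).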
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +namespace ck { + +// Transforms: Tuple +// LowerDimensionHiddenIdss : Tuple, ...> +// UpperDimensionHiddenIdss : Tuple, ...> +// BottomDimensionHiddenIds : Sequence<...> +// TopDimensionHiddenIds : Sequence<...> +template +struct TensorAdaptor +{ + __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); } + + __host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; } + + __host__ __device__ static constexpr auto GetLowerDimensionHiddenIdss() + { + return LowerDimensionHiddenIdss{}; + } + + __host__ __device__ static constexpr auto GetUpperDimensionHiddenIdss() + { + return UpperDimensionHiddenIdss{}; + } + + __host__ __device__ static constexpr auto GetTopDimensionHiddenIds() + { + return TopDimensionHiddenIds{}; + } + + __host__ __device__ static constexpr auto GetBottomDimensionHiddenIds() + { + return BottomDimensionHiddenIds{}; + } + + __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + { + const auto lengths = generate_tuple( + [&](auto idim_top) { + constexpr auto tmp = GetTransformAndItsUpperDimension(idim_top); + + constexpr index_t itran = tmp[Number<0>{}]; + constexpr index_t idim_up = tmp[Number<1>{}]; + constexpr bool found = tmp[Number<2>{}]; + + static_assert(found == true, + "wrong! not found matching transformation and upper-dimension"); + + const auto length = + transforms[Number{}].GetUpperLengths()[Number{}]; + + return length; + }, + Number{}); + + // TODO: make container_reduce support tuple of Number and index_t + return container_reduce(lengths, math::multiplies{}, Number<1>{}); + } + + template + __host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number) + { + constexpr auto idim_top = Number{}; + + constexpr index_t idim_hidden = TopDimensionHiddenIds::At(idim_top); + + index_t itran_found = 0; + index_t idim_up_found = 0; + bool found = false; + + static_for<0, ntransform_, 1>{}([&](auto itran) { + constexpr auto up_dim_ids = UpperDimensionHiddenIdss{}[itran]; + + static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) { + if constexpr(up_dim_ids[idim_up] == idim_hidden) + { + itran_found = itran; + idim_up_found = idim_up; + found = true; + } + }); + }); + + return make_tuple(itran_found, idim_up_found, found); + } + + __host__ __device__ static constexpr index_t GetNumOfBottomDimension() + { + return BottomDimensionHiddenIds::Size(); + } + + __host__ __device__ static constexpr index_t GetNumOfTopDimension() + { + return TopDimensionHiddenIds::Size(); + } + + __host__ __device__ static constexpr index_t GetNumOfHiddenDimension() + { + constexpr auto all_low_dim_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, + LowerDimensionHiddenIdss{}); + + constexpr auto all_up_dim_ids = unpack( + [](auto&&... 
xs) constexpr { return merge_sequences(xs...); }, + UpperDimensionHiddenIdss{}); + + constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids); + + using unique_sort_all_dim_ids = typename sequence_unique_sort, + math::equal>::type; + + return unique_sort_all_dim_ids::Size(); + } + + constexpr static index_t ntransform_ = GetNumOfTransform(); + constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension(); + constexpr static index_t ndim_bottom_ = GetNumOfBottomDimension(); + constexpr static index_t ndim_top_ = GetNumOfTopDimension(); + + using HiddenIndex = MultiIndex; + using BottomIndex = MultiIndex; + using TopIndex = MultiIndex; + + // may be index_t or Number<> + using ElementSize = remove_cv_t; + + public: +#if 0 // workaround compiler complaint about constexpr + __host__ __device__ constexpr TensorAdaptor() = default; +#else + __host__ __device__ constexpr TensorAdaptor() : transforms_{}, element_size_{} {} +#endif + + __host__ __device__ constexpr TensorAdaptor(const Transforms& transforms) + : transforms_{transforms}, element_size_{InitializeElementSize(transforms)} + { + static_assert(Transforms::Size() == ntransform_ && + LowerDimensionHiddenIdss::Size() == ntransform_ && + UpperDimensionHiddenIdss::Size() == ntransform_, + "wrong! inconsistent # of transformations"); + + // TODO check dependency of dimensions is valid + } + + __host__ __device__ constexpr auto GetElementSize() const { return element_size_; } + +#if 0 // debug + template + __host__ __device__ constexpr index_t GetTopDimensionLength(Number idim) const + { + // TODO: not implemented + } + + template + __host__ __device__ constexpr index_t GetBottomDimensionLength(Number idim) const + { + // TODO: not implemented + } +#endif + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + static_assert(TopIdx::Size() == TopDimensionHiddenIds::Size(), + "wrong! 
# of dimension inconsistent"); + + constexpr index_t ntransform = GetNumOfTransform(); + constexpr index_t ndim_hidden = GetNumOfHiddenDimension(); + + MultiIndex idx_hidden; + + // initialize uppest index + set_container_subset(idx_hidden, GetTopDimensionHiddenIds(), idx_top); + + // calculate hidden index + static_for{}([&](auto itran_p1) { + auto itran = itran_p1 - Number<1>{}; + const auto& tran = GetTransforms().At(itran); + constexpr auto dims_low = GetLowerDimensionHiddenIdss().At(itran); + constexpr auto dims_up = GetUpperDimensionHiddenIdss().At(itran); + + const auto idx_up = get_container_subset(idx_hidden, dims_up); + + MultiIndex idx_low; + + tran.CalculateLowerIndex(idx_low, idx_up); + + set_container_subset(idx_hidden, dims_low, idx_low); + }); + + return get_container_subset(idx_hidden, BottomDimensionHiddenIds{}); + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + bool is_known = true; + + static_for<0, Transforms::Size(), 1>{}([&](auto i) { + is_known &= remove_cvref_t::IsKnownAtCompileTime(); + }); + + return is_known && is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("TensorAdaptor, "); + static_for<0, ntransform_, 1>{}([&](auto i) { + printf("transforms: "); + transforms_[i].Print(); + printf("LowerDimensionHiddenIds:"); + LowerDimensionHiddenIdss{}.At(i).Print(); + printf("UpperDimensionHiddenIds:"); + UpperDimensionHiddenIdss{}.At(i).Print(); + }); + + printf("BottomDimensionHiddenIds:"); + BottomDimensionHiddenIds::Print(); + printf("TopDimensionHiddenIds:"); + TopDimensionHiddenIds::Print(); + + printf("}"); + } + + private: + Transforms transforms_; + ElementSize element_size_; +}; + +template +__host__ __device__ constexpr auto chain_tensor_adaptors(const TensorAdaptor0& adaptor0, + const TensorAdaptor1& adaptor1) +{ + static_assert(TensorAdaptor0::GetNumOfTopDimension() == + TensorAdaptor1::GetNumOfBottomDimension(), + "wrong!"); + + // all_transforms = transform0 + transform1 + const auto all_transforms = + container_concat(adaptor0.GetTransforms(), adaptor1.GetTransforms()); + + // shift + constexpr index_t adaptor0_max_hidden_id = [&]() { + index_t adaptor0_max_hidden_id_ = NumericLimits::Min(); + + static_for<0, TensorAdaptor0::GetNumOfTransform(), 1>{}([&](auto itran) { + constexpr index_t ndim_low = + TensorAdaptor0{}.GetTransforms()[itran].GetNumOfLowerDimension(); + + static_for<0, ndim_low, 1>{}([&](auto idim_low) { + adaptor0_max_hidden_id_ = + math::max(adaptor0_max_hidden_id_, + TensorAdaptor0::GetLowerDimensionHiddenIdss()[itran][idim_low].value); + }); + + constexpr index_t ndim_up = + TensorAdaptor0{}.GetTransforms()[itran].GetNumOfUpperDimension(); + + static_for<0, ndim_up, 1>{}([&](auto idim_up) { + adaptor0_max_hidden_id_ = + math::max(adaptor0_max_hidden_id_, + TensorAdaptor0::GetUpperDimensionHiddenIdss()[itran][idim_up].value); + }); + }); + + return adaptor0_max_hidden_id_; + }(); + + constexpr index_t adaptor1_min_hidden_id = [&]() { + index_t adaptor1_min_hidden_id_ = NumericLimits::Max(); + + static_for<0, TensorAdaptor1::GetNumOfTransform(), 1>{}([&](auto itran) { + constexpr index_t ndim_low = + TensorAdaptor1{}.GetTransforms()[itran].GetNumOfLowerDimension(); + + // get the min of all lower dimenions, but not bottom dimension (because their id will + // be matched with top id from adaptor0) + static_for<0, ndim_low, 1>{}([&](auto idim_low) { + constexpr index_t low_dim_hidden_id = + 
TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran][idim_low].value; + + bool is_bottom_dim = false; + static_for<0, TensorAdaptor1::GetNumOfBottomDimension(), 1>{}([&](auto i) { + if constexpr(low_dim_hidden_id == + TensorAdaptor1::GetBottomDimensionHiddenIds()[i]) + { + is_bottom_dim = true; + } + }); + + if(!is_bottom_dim) + { + adaptor1_min_hidden_id_ = math::min(adaptor1_min_hidden_id_, low_dim_hidden_id); + } + }); + + constexpr index_t ndim_up = + TensorAdaptor1{}.GetTransforms()[itran].GetNumOfUpperDimension(); + + // get the min of all upper dimensions + static_for<0, ndim_up, 1>{}([&](auto idim_up) { + adaptor1_min_hidden_id_ = + math::min(adaptor1_min_hidden_id_, + TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran][idim_up].value); + }); + }); + + return adaptor1_min_hidden_id_; + }(); + + constexpr index_t adaptor1_hidden_id_shift = + adaptor0_max_hidden_id + 1 - adaptor1_min_hidden_id; + + constexpr index_t ndim_bottom_1 = TensorAdaptor1::GetNumOfBottomDimension(); + + // all_low_dim_hidden_idss = + // low_dim_hidden_idss_0 + match_hidden_id_for_1(shift_hidden_id_for_1(low_dim_hiden_idss_1)) + constexpr auto low_dim_hidden_idss_1 = generate_tuple( + // generate sequence of ids for a transform + [&](auto itran) { + constexpr auto ndim_low_1 = TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran].Size(); + + constexpr auto low_dim_hidden_ids_1 = + TensorAdaptor1::GetLowerDimensionHiddenIdss()[itran]; + + // sequence in, sequence out + constexpr auto low_dim_hidden_ids_1_mod = [&]() constexpr + { + auto low_dim_hidden_ids_1_mod_ = to_multi_index(low_dim_hidden_ids_1); + + // shift hidden id so every dim id is unique + static_for<0, ndim_low_1, 1>{}([&](auto idim_low_1) { + low_dim_hidden_ids_1_mod_(idim_low_1) += adaptor1_hidden_id_shift; + }); + + // match hidden id + static_for<0, ndim_low_1, 1>{}([&](auto idim_low_1) { + static_for<0, ndim_bottom_1, 1>{}([&](auto idim_bottom_1) { + // if this low dim is bottom dim, then do id matching + if constexpr(low_dim_hidden_ids_1[idim_low_1] == + TensorAdaptor1::GetBottomDimensionHiddenIds()[idim_bottom_1]) + { + low_dim_hidden_ids_1_mod_(idim_low_1) = + TensorAdaptor0::GetTopDimensionHiddenIds()[idim_bottom_1]; + } + }); + }); + + return low_dim_hidden_ids_1_mod_; + } + (); + + return generate_sequence_v2( + [&](auto i) constexpr { return Number{}; }, + Number{}); + }, + Number{}); + + constexpr auto all_low_dim_hidden_idss = + container_concat(TensorAdaptor0::GetLowerDimensionHiddenIdss(), low_dim_hidden_idss_1); + + // all_up_dim_hidden_idss = + // up_dim_hidden_idss_0 + shift_hidden_id_for_1(up_dim_hiden_idss_1) + constexpr auto up_dim_hidden_idss_1 = generate_tuple( + // generate sequence of ids for a transform + [&](auto itran) { + constexpr auto ndim_up_1 = TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran].Size(); + + constexpr auto up_dim_hidden_ids_1 = + TensorAdaptor1::GetUpperDimensionHiddenIdss()[itran]; + + // sequence in, constexpr tuple out + constexpr auto up_dim_hidden_ids_1_mod = [&]() constexpr + { + auto up_dim_hidden_ids_1_mod_ = to_multi_index(up_dim_hidden_ids_1); + + // shift hidden id + static_for<0, ndim_up_1, 1>{}([&](auto idim_up_1) { + up_dim_hidden_ids_1_mod_(idim_up_1) += adaptor1_hidden_id_shift; + }); + + return up_dim_hidden_ids_1_mod_; + } + (); + + // constexpr tuple to sequence + return generate_sequence_v2( + [&](auto i) constexpr { return Number{}; }, + Number{}); + }, + Number{}); + + constexpr auto all_up_dim_hidden_idss = + container_concat(TensorAdaptor0::GetUpperDimensionHiddenIdss(), 
up_dim_hidden_idss_1); + + // bottom_dim_hidden_ids = bottom_dim_hidden_ids_0 + constexpr auto bottom_dim_hidden_ids = TensorAdaptor0::GetBottomDimensionHiddenIds(); + + // top_dim_hidden_ids = shift_hidden_id(top_dim_hidden_ids_1) + constexpr auto top_dim_hidden_ids = + TensorAdaptor1::GetTopDimensionHiddenIds() + Number{}; + + // put everything together + return TensorAdaptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{all_transforms}; +} + +// Transforms: Tuple +// LowerDimensionOldTopIdss: Tuple, ...> +// UpperDimensionNewTopIdss: Tuple, ...> +template +__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms& transforms, + LowerDimensionOldTopIdss, + UpperDimensionNewTopIdss) +{ + constexpr index_t ntransform = Transforms::Size(); + + static_assert(LowerDimensionOldTopIdss::Size() == ntransform && + UpperDimensionNewTopIdss::Size() == ntransform, + "wrong!"); + + // sanity check on LowerDimensionOldTopIdss and UpperDimensionNewTopIdss + constexpr auto all_low_dim_old_top_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionOldTopIdss{}); + + constexpr auto all_up_dim_new_top_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionNewTopIdss{}); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong!"); + + constexpr index_t ndim_old_top = all_low_dim_old_top_ids.Size(); + constexpr index_t ndim_new_top = all_up_dim_new_top_ids.Size(); + + // low_dim_hidden_idss + constexpr auto low_dim_hidden_idss = LowerDimensionOldTopIdss{}; + + // up_dim_hidden_idss: shift UpperDimensionNewTopIdss by ndim_bottom + constexpr auto up_dim_hidden_idss = generate_tuple( + [](auto itran) { return UpperDimensionNewTopIdss{}[itran] + Number{}; }, + Number{}); + + // bottom_dim_hidden_ids + constexpr auto bottom_dim_hidden_ids = + typename arithmetic_sequence_gen<0, ndim_old_top, 1>::type{}; + + // top_dim_hidden_ids + constexpr auto top_dim_hidden_ids = + typename arithmetic_sequence_gen<0, ndim_new_top, 1>::type{} + Number{}; + + return TensorAdaptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms}; +} + +template = 2, bool>::type = false> +__host__ __device__ constexpr auto chain_tensor_adaptors(const X& x, const Xs&... xs) +{ + return chain_tensor_adaptors(x, chain_tensor_adaptors(xs...)); +} + +} // namespace ck diff --git a/include/ck/tensor_description/tensor_descriptor.hpp b/include/ck/tensor_description/tensor_descriptor.hpp new file mode 100644 index 00000000..f07d5b17 --- /dev/null +++ b/include/ck/tensor_description/tensor_descriptor.hpp @@ -0,0 +1,615 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
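+
+// A TensorDescriptor combines a chain of index transforms with an element
+// space size, mapping a "visible" multi-index to a linear memory offset.
+//
+// Minimal usage sketch (assumes the make_naive_tensor_descriptor*() factories
+// from tensor_descriptor_helper.hpp and make_merge_transform() from
+// multi_index_transform_helper.hpp; the 4 x 8 shape is only an example):
+//
+//   // packed row-major [4, 8] view: offset = 8 * i0 + i1
+//   const auto desc_m_n =
+//       make_naive_tensor_descriptor_packed(make_tuple(Number<4>{}, Number<8>{}));
+//
+//   // fold it into a 1-D view of length 32
+//   const auto desc_linear = transform_tensor_descriptor(
+//       desc_m_n,
+//       make_tuple(make_merge_transform(make_tuple(Number<4>{}, Number<8>{}))),
+//       make_tuple(Sequence<0, 1>{}), // old visible dims consumed
+//       make_tuple(Sequence<0>{}));   // new visible dim produced
+//
+//   const index_t offset = desc_linear.CalculateOffset(make_multi_index(21));
+//   // offset == 21 for this packed layout
+//
+// For repeated accesses inside a kernel, make_tensor_coordinate(),
+// make_tensor_coordinate_step() and move_tensor_coordinate() below update the
+// offset incrementally instead of recomputing it from scratch.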
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/sequence_helper.hpp" +#include "ck/tensor_description/multi_index_transform.hpp" + +namespace ck { + +template +struct TensorCoordinate; + +template +struct TensorCoordinateStep; + +// Transforms: Tuple +// LowerDimensionIdss : Tuple, ...> +// UpperDimensionIdss : Tuple, ...> +// VisibleDimensionIds> : Sequence<...> +template +struct TensorDescriptor +{ + // TODO make these private + __host__ __device__ static constexpr index_t GetNumOfTransform() { return Transforms::Size(); } + + __host__ __device__ static constexpr index_t GetNumOfVisibleDimension() + { + return VisibleDimensionIds::Size(); + } + + __host__ __device__ static constexpr index_t GetNumOfHiddenDimension() + { + constexpr auto all_low_dim_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, LowerDimensionIdss{}); + + constexpr auto all_up_dim_ids = unpack( + [](auto&&... xs) constexpr { return merge_sequences(xs...); }, UpperDimensionIdss{}); + + constexpr auto all_dim_ids = merge_sequences(all_low_dim_ids, all_up_dim_ids); + + using unique_sort_all_dim_ids = typename sequence_unique_sort, + math::equal>::type; + + return unique_sort_all_dim_ids::Size(); + } + + __host__ __device__ static constexpr auto InitializeElementSize(const Transforms& transforms) + { + const auto lengths = generate_tuple( + [&](auto idim_visible) { + constexpr auto tmp = GetTransformAndItsUpperDimension(idim_visible); + + constexpr index_t itran = tmp[Number<0>{}]; + constexpr index_t idim_up = tmp[Number<1>{}]; + constexpr bool found = tmp[Number<2>{}]; + + static_assert(found == true, + "wrong! not found matching transformation and upper-dimension"); + + const auto length = + transforms[Number{}].GetUpperLengths()[Number{}]; + + return length; + }, + Number{}); + + // TODO: make container_reduce support tuple of Number and index_t + return container_reduce(lengths, math::multiplies{}, Number<1>{}); + } + + template + __host__ __device__ static constexpr auto GetTransformAndItsUpperDimension(Number) + { + constexpr auto idim_visible = Number{}; + + constexpr index_t idim_hidden = VisibleDimensionIds::At(idim_visible); + + index_t itran_found = 0; + index_t idim_up_found = 0; + bool found = false; + + static_for<0, ntransform_, 1>{}([&](auto itran) { + constexpr auto up_dim_ids = UpperDimensionIdss{}[itran]; + + static_for<0, up_dim_ids.Size(), 1>{}([&](auto idim_up) { + if constexpr(up_dim_ids[idim_up] == idim_hidden) + { + itran_found = itran; + idim_up_found = idim_up; + found = true; + } + }); + }); + + return make_tuple(itran_found, idim_up_found, found); + } + + constexpr static index_t ntransform_ = GetNumOfTransform(); + constexpr static index_t ndim_visible_ = GetNumOfVisibleDimension(); + constexpr static index_t ndim_hidden_ = GetNumOfHiddenDimension(); + + using VisibleIndex = MultiIndex; + using HiddenIndex = MultiIndex; + using Coordinate = TensorCoordinate; + + // may be index_t or Number<> + using ElementSize = remove_cv_t; + + public: +#if 0 // workaround compiler complaint about constexpr + __host__ __device__ constexpr TensorDescriptor() = default; +#else + __host__ __device__ constexpr TensorDescriptor() + : transforms_{}, element_size_{}, element_space_size_{} + { + } +#endif + + __host__ __device__ constexpr TensorDescriptor(const Transforms& transforms, + ElementSpaceSize element_space_size) + : transforms_{transforms}, + element_size_{InitializeElementSize(transforms)}, + element_space_size_{element_space_size} + + { + 
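+ // element_size_ was already folded in the member initializer list as the
+ // product of the visible dimension lengths; here we only check that every
+ // transform comes with exactly one lower and one upper dimension-id sequence.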
static_assert(Transforms::Size() == ntransform_ && + LowerDimensionIdss::Size() == ntransform_ && + UpperDimensionIdss::Size() == ntransform_, + "wrong! inconsistent # of transformations"); + + // TODO check dependency of dimensions is valid + } + + __host__ __device__ static constexpr index_t GetNumOfDimension() + { + return GetNumOfVisibleDimension(); + } + + template + __host__ __device__ constexpr auto GetLength(Number) const + { + static_assert(IDim >= 0 && IDim < ndim_visible_, "wrong! out of range"); + + constexpr auto tmp = GetTransformAndItsUpperDimension(Number{}); + + constexpr index_t itran = tmp[Number<0>{}]; + constexpr index_t idim_up = tmp[Number<1>{}]; + constexpr bool found = tmp[Number<2>{}]; + + static_assert(found == true, + "wrong! not found matching transformation and upper-dimension"); + + return transforms_[Number{}].GetUpperLengths()[Number{}]; + } + + __host__ __device__ constexpr auto GetLengths() const + { + // FIXME: use Tuple of reference instead + return generate_sequence_v2([&](auto I) { return GetLength(I); }, Number{}); + } + + __host__ __device__ constexpr auto GetElementSize() const { return element_size_; } + + __host__ __device__ constexpr auto GetElementSpaceSize() const { return element_space_size_; } + + template + __host__ __device__ constexpr index_t CalculateOffset(const Idx& idx) const + { + static_assert(Idx::Size() == GetNumOfDimension(), "wrong! inconsistent # of dimension"); + + return make_tensor_coordinate(*this, idx).GetOffset(); + } + + // TODO make these private + __host__ __device__ constexpr const auto& GetTransforms() const { return transforms_; } + + __host__ __device__ static constexpr auto GetLowerDimensionIdss() + { + return LowerDimensionIdss{}; + } + + __host__ __device__ static constexpr auto GetUpperDimensionIdss() + { + return UpperDimensionIdss{}; + } + + __host__ __device__ static constexpr auto GetVisibleDimensionIds() + { + return VisibleDimensionIds{}; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + bool is_known = true; + + static_for<0, Transforms::Size(), 1>{}([&](auto i) { + is_known &= remove_cvref_t::IsKnownAtCompileTime(); + }); + + return is_known && is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("TensorDescriptor, "); + static_for<0, ntransform_, 1>{}([&](auto i) { + printf("transforms: "); + transforms_[i].Print(); + printf("LowerDimensionIds:"); + LowerDimensionIdss{}.At(i).Print(); + printf("UpperDimensionIds:"); + UpperDimensionIdss{}.At(i).Print(); + }); + printf("}"); + + VisibleDimensionIds::Print(); + } + + // TODO make these private + Transforms transforms_; + ElementSize element_size_; + ElementSpaceSize element_space_size_; +}; + +template +struct TensorCoordinate +{ + // TODO make these private + static constexpr index_t ndim_visible_ = VisibleDimensionIds::Size(); + + using HiddenIndex = MultiIndex; + using VisibleIndex = MultiIndex; + + public: + __host__ __device__ constexpr TensorCoordinate() = default; + + __host__ __device__ constexpr TensorCoordinate(const HiddenIndex& idx_hidden) + : idx_hidden_{idx_hidden} + { + } + + __host__ __device__ constexpr auto GetIndex() const { return GetVisibleIndex(); } + + __host__ __device__ constexpr index_t GetOffset() const { return idx_hidden_[Number<0>{}]; } + + // TODO make these private + __host__ __device__ constexpr const auto& GetHiddenIndex() const { return idx_hidden_; } + + __host__ __device__ auto& GetHiddenIndex() { 
return idx_hidden_; } + + __host__ __device__ constexpr auto GetVisibleIndex() const + { + return get_container_subset(idx_hidden_, VisibleDimensionIds{}); + } + + // TODO make these private + HiddenIndex idx_hidden_; +}; + +template +struct TensorCoordinateStep +{ + // TODO make these private + using VisibleIndex = MultiIndex; + + public: + __host__ __device__ constexpr TensorCoordinateStep() = default; + + __host__ __device__ constexpr TensorCoordinateStep(const VisibleIndex& idx_diff_visible, + const MultiIndex& do_transforms) + : idx_diff_visible_{idx_diff_visible}, do_transforms_{do_transforms} + { + } + + __host__ __device__ constexpr const auto& GetIndexDiff() const { return GetVisibleIndexDiff(); } + + // TODO make these private + __host__ __device__ constexpr const auto& GetVisibleIndexDiff() const + { + return idx_diff_visible_; + } + + VisibleIndex idx_diff_visible_; + MultiIndex do_transforms_; + + // HACK: control UpdateLowerIndex() + static constexpr UpdateLowerIndexHack update_lower_index_hack_; +}; + +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor, and to put it outside the scope where it is used +// (transform_tensor_descriptor) because template cannot be defined inside a function +// template +template +struct lambda_get_up_dim_num +{ + template + __host__ __device__ constexpr auto operator()(I) const + { + using Tran = remove_reference_t; + return Number{}; + } +}; + +template +__host__ __device__ constexpr auto +transform_tensor_descriptor(const OldTensorDescriptor& old_tensor_desc, + const NewTransforms& new_transforms, + NewLowerDimensionOldVisibleIdss, + NewUpperDimensionNewVisibleIdss) +{ + // sanity check + { + static_assert(NewTransforms::Size() == NewLowerDimensionOldVisibleIdss::Size() && + NewTransforms::Size() == NewUpperDimensionNewVisibleIdss::Size(), + "wrong! inconsitent number of transform"); + + constexpr auto all_old_top_ids = unpack([](auto... xs) { return merge_sequences(xs...); }, + NewLowerDimensionOldVisibleIdss{}); + + constexpr auto all_new_top_ids = unpack([](auto... 
xs) { return merge_sequences(xs...); }, + NewUpperDimensionNewVisibleIdss{}); + + static_assert(is_valid_sequence_map::value && + is_valid_sequence_map::value, + "wrong!"); + } + + // lower dimension's hidden idss + // convert lower dimension visible idss (tuple of sequences) to hidden idss (tuple of + // sequences) + constexpr auto low_dim_hidden_idss = transform_tuples( + // convert lower dimension visible ids (a sequence) to hidden ids (a sequence) + [](auto low_dim_visible_ids) constexpr { + return transform_sequences( + // convert lower dimension visible id to hidden id + [](auto low_dim_visible_id) constexpr { + return OldTensorDescriptor::GetVisibleDimensionIds()[low_dim_visible_id]; + }, + low_dim_visible_ids); + }, + NewLowerDimensionOldVisibleIdss{}); + + constexpr index_t num_new_transform = NewTransforms::Size(); + + // upper dimension's hidden idss + constexpr index_t old_hidden_dim_number = OldTensorDescriptor::GetNumOfHiddenDimension(); + + constexpr auto up_dim_numbers = + generate_sequence(lambda_get_up_dim_num{}, Number{}); + + constexpr auto up_dim_numbers_scan = merge_sequences( + Sequence<0>{}, inclusive_scan_sequence(up_dim_numbers, math::plus{}, Number<0>{})); + + constexpr auto up_dim_hidden_idss = generate_tuple( + [ old_hidden_dim_number, up_dim_numbers_scan ](auto i) constexpr { + return + typename arithmetic_sequence_gen::type{}; + }, + Number{}); + + // new visible dimension's hidden ids + constexpr auto unordered_new_visible_dim_hidden_ids = unpack( + [](auto... xs) constexpr { return merge_sequences(xs...); }, up_dim_hidden_idss); + + constexpr auto new_visible_dim_unordered2ordered = unpack( + [](auto... xs) constexpr { return merge_sequences(xs...); }, + NewUpperDimensionNewVisibleIdss{}); + + constexpr auto new_visible_dim_hidden_ids = + unordered_new_visible_dim_hidden_ids.ReorderGivenOld2New(new_visible_dim_unordered2ordered); + + // put everything together + const auto all_transforms = container_concat(old_tensor_desc.GetTransforms(), new_transforms); + + constexpr auto all_low_dim_hidden_idss = + container_concat(OldTensorDescriptor::GetLowerDimensionIdss(), low_dim_hidden_idss); + + constexpr auto all_up_dim_hidden_idss = + container_concat(OldTensorDescriptor::GetUpperDimensionIdss(), up_dim_hidden_idss); + + const auto element_space_size = old_tensor_desc.GetElementSpaceSize(); + + return TensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{all_transforms, + element_space_size}; +} + +template +__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc& tensor_desc, + const VisibleIndex& idx_visible) +{ + static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(), + "wrong! 
# of dimension inconsistent"); + + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); + constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds(); + + MultiIndex idx_hidden; + + // initialize visible index + set_container_subset(idx_hidden, visible_dim_ids, idx_visible); + + // calculate hidden index + static_for{}([&tensor_desc, &idx_hidden](auto itran_p1) { + auto itran = itran_p1 - Number<1>{}; + const auto& tran = tensor_desc.GetTransforms().At(itran); + constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); + constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); + + const auto idx_up = get_container_subset(idx_hidden, dims_up); + + MultiIndex idx_low; + + tran.CalculateLowerIndex(idx_low, idx_up); + + set_container_subset(idx_hidden, dims_low, idx_low); + }); + + return TensorCoordinate{idx_hidden}; +} + +// UpdateLowerIndexHack: Sequence<...> +// HACK: control UpdateLowerIndex +template +__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc&, + const VisibleIndex& idx_diff_visible, + UpdateLowerIndexHack) +{ + static_assert(TensorDesc::GetNumOfDimension() == VisibleIndex::Size(), + "wrong! # of dimension inconsistent"); + + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); + constexpr index_t ndim_visible = TensorDesc::GetNumOfVisibleDimension(); + constexpr auto visible_dim_ids = TensorDesc::GetVisibleDimensionIds(); + + static_assert(UpdateLowerIndexHack::Size() == ntransform, "wrong!"); + + // use index_t for boolean type + auto do_transforms = make_zero_multi_index(); + auto is_non_zero_diff = make_zero_multi_index(); + + // decide do_transform by checkout non-zero index diff components + MultiIndex non_zero_diff_pick_visible; + + static_for<0, ndim_visible, 1>{}( + [&](auto i) { non_zero_diff_pick_visible(i) = (idx_diff_visible[i] != 0); }); + + set_container_subset(is_non_zero_diff, visible_dim_ids, non_zero_diff_pick_visible); + + static_for{}([&](auto itran) { + constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); + constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); + + const auto non_zero_diff_pick_up = get_container_subset(is_non_zero_diff, dims_up); + + MultiIndex non_zero_diff_pick_low; + + // if any of upper index diff components is non-zero, then + // 1) Need to do this transform + // 2) all components of lower index diff will assume to be non-zero and need to be + // computed + const bool idx_diff_up_has_non_zero = container_reduce( + non_zero_diff_pick_up, [](auto a, auto b) constexpr { return a or b; }, false); + + do_transforms(itran) = idx_diff_up_has_non_zero; + + static_for<0, dims_low.Size(), 1>{}( + [&](auto i) { non_zero_diff_pick_low(i) = idx_diff_up_has_non_zero; }); + + set_container_subset(is_non_zero_diff, dims_low, non_zero_diff_pick_low); + }); + + return TensorCoordinateStep{idx_diff_visible, + do_transforms}; +} + +template +__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc&, + const VisibleIndex& idx_diff_visible) +{ + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + + return make_tensor_coordinate_step( + TensorDesc{}, idx_diff_visible, typename uniform_sequence_gen::type{}); +} + +template +__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc& tensor_desc, + TensorCoord& coord, + const 
TensorCoordStep& coord_step) +{ + constexpr index_t ndim_hidden = TensorDesc::GetNumOfHiddenDimension(); + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + + // this is what needs to be calculated + auto idx_diff_hidden = make_zero_multi_index(); + + // initialize visible index diff + set_container_subset( + idx_diff_hidden, TensorDesc::GetVisibleDimensionIds(), coord_step.GetVisibleIndexDiff()); + + // this is what needs to be updated + auto& idx_hidden = coord.GetHiddenIndex(); + + // update visible index + auto idx_hidden_pick_visible = + get_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds()); + + idx_hidden_pick_visible += coord_step.GetIndexDiff(); + + set_container_subset(idx_hidden, TensorDesc::GetVisibleDimensionIds(), idx_hidden_pick_visible); + + // update rest of hidden index + static_for{}([&](auto itran) { + if(coord_step.do_transforms_[itran]) + { + const auto& tran = tensor_desc.GetTransforms().At(itran); + constexpr auto dims_low = TensorDesc::GetLowerDimensionIdss().At(itran); + constexpr auto dims_up = TensorDesc::GetUpperDimensionIdss().At(itran); + + const auto idx_up_new = get_container_subset(idx_hidden, dims_up); + auto idx_low = get_container_subset(idx_hidden, dims_low); + const auto idx_diff_up = get_container_subset(idx_diff_hidden, dims_up); + + MultiIndex idx_diff_low; + + // HACK: control UpdateLowerIndex for Merge using hack + constexpr index_t Hack = decltype(coord_step.update_lower_index_hack_)::At(itran); + + tran.UpdateLowerIndex(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number{}); + + set_container_subset(idx_diff_hidden, dims_low, idx_diff_low); + set_container_subset(idx_hidden, dims_low, idx_low); + } + }); +} + +template +__host__ __device__ constexpr bool +coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc& tensor_desc, + const TensorCoord& coord) +{ + bool valid = true; + + constexpr index_t ntransform = TensorDesc::GetNumOfTransform(); + + const auto& idx_hidden = coord.GetHiddenIndex(); + + static_for{}([&tensor_desc, &idx_hidden, &valid](auto itran) { + const auto tran = tensor_desc.GetTransforms().At(itran); + + // check validity, only if current transformation does not always has a valid mapping + if constexpr(!decltype(tran)::IsValidUpperIndexAlwaysMappedToValidLowerIndex()) + { + const auto idx_up = + get_container_subset(idx_hidden, TensorDesc::GetUpperDimensionIdss().At(itran)); + + // Comment: using valid = valid && .. 
will result in weird control flow in ISA + valid &= tran.IsValidUpperIndexMappedToValidLowerIndex(idx_up); + } + }); + + return valid; +} + +template +__host__ __device__ constexpr bool coordinate_has_valid_offset(const TensorDesc& tensor_desc, + const TensorCoord& coord) +{ + // check visible index + const auto& idx_visible = coord.GetVisibleIndex(); + + bool is_visible_index_valid = true; + + static_for<0, TensorDesc::GetNumOfDimension(), 1>{}( + [&is_visible_index_valid, &idx_visible, &tensor_desc](auto i) { + is_visible_index_valid = + is_visible_index_valid && + (idx_visible[i] >= 0 && idx_visible[i] < tensor_desc.GetLength(i)); + }); + + // check other hidden index + return is_visible_index_valid && + coordinate_has_valid_offset_assuming_visible_index_is_valid(tensor_desc, coord); +} + +template +using TensorCoordinate_t = decltype(make_tensor_coordinate( + TensorDesc{}, MultiIndex::GetNumOfDimension()>{})); + +template +using TensorCoordinateStep_t = decltype(make_tensor_coordinate_step( + TensorDesc{}, MultiIndex::GetNumOfDimension()>{})); + +} // namespace ck diff --git a/include/ck/tensor_description/tensor_descriptor_helper.hpp b/include/ck/tensor_description/tensor_descriptor_helper.hpp new file mode 100644 index 00000000..461aae72 --- /dev/null +++ b/include/ck/tensor_description/tensor_descriptor_helper.hpp @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" + +namespace ck { + +/* + * These functions create tensor descriptor at runtime. If they are not constexpr, you will + * likely see usage of scratch memory during construction of these tensor descriptors. So + * it's better to call these functions on host and then pass the constructed tensor descritpors + * to GPU. If the tensor descritpors being constructed are constexpr, then you can call these + * functions on GPU without worrying about scratch memory usage. + */ + +#if CK_WORKAROUND_SWDEV_275126 +template +__host__ __device__ constexpr auto calculate_element_space_size_impl(const Lengths& lengths, + const Strides& strides, + Number i, + AccOld acc_old) +{ + auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i]; + + if constexpr(i.value < Lengths::Size() - 1) + { + return calculate_element_space_size_impl(lengths, strides, i + Number<1>{}, acc_new); + } + else + { + return acc_new; + } +} +#endif + +// Lengths..., Strides... 
could be: +// 1) index_t, which is known at run-time, or +// 2) Number<>, which is known at compile-time +// element_space_size could be: +// 1) long_index_t, or +// 2) LongNumber<> +template ::type = false> +__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple& lengths, + const Tuple& strides) +{ + constexpr index_t N = sizeof...(Lengths); + + const auto transforms = make_tuple(make_embed_transform(lengths, strides)); + + constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{}); + + constexpr auto up_dim_hidden_idss = + make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{}); + + constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{}; + +#if !CK_WORKAROUND_SWDEV_275126 + // rocm-4.1 compiler would crash for recursive labmda + // recursive function for reduction + auto f = [&](auto fs, auto i, auto acc_old) { + auto acc_new = acc_old + (lengths[i] - Number<1>{}) * strides[i]; + + if constexpr(i.value < N - 1) + { + return fs(fs, i + Number<1>{}, acc_new); + } + else + { + return acc_new; + } + }; + + const auto element_space_size = f(f, Number<0>{}, LongNumber<1>{}); +#else + const auto element_space_size = + calculate_element_space_size_impl(lengths, strides, Number<0>{}, LongNumber<1>{}); +#endif + + return TensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms, + element_space_size}; +} + +// Lengths... could be: +// 1) index_t, which is known at run-time, or +// 2) Number<>, which is known at compile-time +// element_space_size could be: +// 1) long_index_t, or +// 2) LongNumber<> +template +__host__ __device__ constexpr auto +make_naive_tensor_descriptor_packed(const Tuple& lengths) +{ + constexpr index_t N = sizeof...(Lengths); + + const auto transforms = make_tuple(make_unmerge_transform(lengths)); + + constexpr auto low_dim_hidden_idss = make_tuple(Sequence<0>{}); + + constexpr auto up_dim_hidden_idss = + make_tuple(typename arithmetic_sequence_gen<1, N + 1, 1>::type{}); + + constexpr auto visible_dim_hidden_ids = typename arithmetic_sequence_gen<1, N + 1, 1>::type{}; + + const auto element_space_size = container_reduce(lengths, math::multiplies{}, LongNumber<1>{}); + + return TensorDescriptor, + remove_cv_t, + remove_cv_t, + remove_cv_t, + remove_cv_t>{transforms, + element_space_size}; +} + +// Lengths... 
could be: +// 1) index_t, which is known at run-time, or +// 2) Number<>, which is known at compile-time +// align could be: +// 1) index_t, or +// 2) Number<> +template +__host__ __device__ constexpr auto +make_naive_tensor_descriptor_aligned(const Tuple& lengths, Align align) +{ + constexpr auto I1 = Number<1>{}; + + constexpr index_t N = sizeof...(Lengths); + + const auto stride_n_minus_2 = math::integer_least_multiple(lengths[Number{}], align); + + auto strides = generate_tuple( + [&](auto i) { + if constexpr(i.value == N - 1) + { + return I1; + } + else if constexpr(i.value == N - 2) + { + return Number{}; + } + else + { + return container_reduce(lengths, + math::multiplies{}, + Number{}, + i + I1, + Number{}, + I1); + } + }, + Number{}); + + return make_naive_tensor_descriptor(lengths, strides); +} + +} // namespace ck diff --git a/include/ck/tensor_description/tensor_space_filling_curve.hpp b/include/ck/tensor_description/tensor_space_filling_curve.hpp new file mode 100644 index 00000000..17c9100b --- /dev/null +++ b/include/ck/tensor_description/tensor_space_filling_curve.hpp @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/sequence_helper.hpp" +#include "ck/utility/statically_indexed_array_multi_index.hpp" +#include "ck/utility/tuple_helper.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +template // # of scalars per access in each dimension +struct SpaceFillingCurve +{ + static constexpr index_t nDim = TensorLengths::Size(); + + using Index = MultiIndex; + + static constexpr index_t ScalarPerVector = + reduce_on_sequence(ScalarsPerAccess{}, math::multiplies{}, Number<1>{}); + + static constexpr auto access_lengths = TensorLengths{} / ScalarsPerAccess{}; + static constexpr auto dim_access_order = DimAccessOrder{}; + static constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + static constexpr auto to_index_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(ordered_access_lengths)), + make_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr index_t GetNumOfAccess() + { + static_assert(TensorLengths::Size() == ScalarsPerAccess::Size()); + static_assert(TensorLengths{} % ScalarsPerAccess{} == + typename uniform_sequence_gen::type{}); + + return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) / + ScalarPerVector; + } + + template + static __device__ __host__ constexpr auto GetStepBetween(Number, + Number) + { + static_assert(AccessIdx1dBegin >= 0, "1D index should be non-negative"); + static_assert(AccessIdx1dBegin < GetNumOfAccess(), "1D index should be larger than 0"); + static_assert(AccessIdx1dEnd >= 0, "1D index should be non-negative"); + static_assert(AccessIdx1dEnd < GetNumOfAccess(), "1D index should be larger than 0"); + + constexpr auto idx_begin = GetIndex(Number{}); + constexpr auto idx_end = GetIndex(Number{}); + return idx_end - idx_begin; + } + + template + static __device__ __host__ constexpr auto GetForwardStep(Number) + { + static_assert(AccessIdx1d < GetNumOfAccess(), "1D index should be larger than 0"); + return GetStepBetween(Number{}, Number{}); + } + + template + static 
__device__ __host__ constexpr auto GetBackwardStep(Number) + { + static_assert(AccessIdx1d > 0, "1D index should be larger than 0"); + + return GetStepBetween(Number{}, Number{}); + } + + template + static __device__ __host__ constexpr Index GetIndex(Number) + { +#if 0 + /* + * \todo: TensorAdaptor::CalculateBottomIndex does NOT return constexpr as expected. + */ + constexpr auto ordered_access_idx = to_index_adaptor.CalculateBottomIndex(make_multi_index(Number{})); +#else + + constexpr auto access_strides = container_reverse_exclusive_scan( + ordered_access_lengths, math::multiplies{}, Number<1>{}); + + constexpr auto idx_1d = Number{}; + // Given tensor strides \p access_lengths, and 1D index of space-filling-curve, compute the + // idim-th element of multidimensional index. + // All constexpr variables have to be captured by VALUE. + constexpr auto compute_index = [ idx_1d, access_strides ](auto idim) constexpr + { + constexpr auto compute_index_impl = [ idx_1d, access_strides ](auto jdim) constexpr + { + auto res = idx_1d.value; + auto id = 0; + + static_for<0, jdim.value + 1, 1>{}([&](auto kdim) { + id = res / access_strides[kdim].value; + res -= id * access_strides[kdim].value; + }); + + return id; + }; + + constexpr auto id = compute_index_impl(idim); + return Number{}; + }; + + constexpr auto ordered_access_idx = generate_tuple(compute_index, Number{}); +#endif + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto idim) { + index_t tmp = ordered_access_idx[I0]; + + static_for<1, idim, 1>{}( + [&](auto j) { tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; }); + + forward_sweep_(idim) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate multi-dim tensor index + auto idx_md = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto idim) { + ordered_idx(idim) = + !SnakeCurved || forward_sweep[idim] + ? ordered_access_idx[idim] + : ordered_access_lengths[idim] - 1 - ordered_access_idx[idim]; + }); + + return container_reorder_given_old2new(ordered_idx, dim_access_order) * + ScalarsPerAccess{}; + }(); + return idx_md; + } + + // FIXME: rename this function + template + static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number) + { + constexpr auto idx = GetIndex(Number{}); + + return generate_tuple([&](auto i) { return Number{}; }, Number{}); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp new file mode 100644 index 00000000..8b1b7be1 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp" + +namespace ck { + +// C[BM0, BM1, BN0, BN1] += transpose(A[K, BM0, BM1]) * B[K, BN0, BN1] +// A and B are visable to the whole block, C is distributed among each thread +// Assume: +// 1. A: +// 1. ABlockDesc_BK0_BM_BK1 is known at compile-time +// 2. ABlockBuffer is DynamicBuffer +// 2. B: +// 1. BBlockDesc_BK0_BN_BK1 is known at compile-time +// 2. BBlockBuffer is DynamicBuffer +// 3. C: +// 1. 
CThreadDesc_BM0_BM11_BN0_BN11 is known at compile-time +// 2. CThreadBuffer is StaticBuffer +// Also assume: +// BM10BN10ThreadClusterBM10Xs::Size() = BM10BN10ThreadClusterBN10Xs::Size() == 2 +// BM0 = BN0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization) +template + typename BM10BN10ThreadClusterBN10Xs, // Sequence + index_t AThreadCopyScalarPerVector_BM11, + index_t BThreadCopyScalarPerVector_BN11, + typename enable_if::type = false> +struct BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2 +{ + using AIndex = MultiIndex<3>; + using BIndex = MultiIndex<3>; + using CIndex = MultiIndex<4>; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr index_t BK0 = ABlockDesc_BK0_BM_BK1{}.GetLength(I0); + static constexpr index_t BK1 = ABlockDesc_BK0_BM_BK1{}.GetLength(I2); + static constexpr index_t BM = ABlockDesc_BK0_BM_BK1{}.GetLength(I1); + static constexpr index_t BN = BBlockDesc_BK0_BN_BK1{}.GetLength(I1); + + static constexpr index_t BM100 = BM10BN10ThreadClusterBM10Xs{}[I0]; + static constexpr index_t BN100 = BM10BN10ThreadClusterBN10Xs{}[I0]; + + static constexpr index_t BM101 = BM10BN10ThreadClusterBM10Xs{}[I1]; + static constexpr index_t BN101 = BM10BN10ThreadClusterBN10Xs{}[I1]; + + static constexpr index_t BM11 = BM1PerThreadBM11; + static constexpr index_t BN11 = BN1PerThreadBN11; + + static constexpr index_t BM1 = BM100 * BM101 * BM11; + static constexpr index_t BN1 = BN100 * BN101 * BN11; + + static constexpr index_t BM0 = BM / BM1; + static constexpr index_t BN0 = BN / BN1; + + __host__ __device__ static constexpr auto + MakeABlockDescriptor_BK0_BM0_BM1_BK1(const ABlockDesc_BK0_BM_BK1& a_block_desc_bk0_bm_bk1) + { + const auto a_block_bk0_bm0_bm1_bk1 = transform_tensor_descriptor( + a_block_desc_bk0_bm_bk1, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return a_block_bk0_bm0_bm1_bk1; + } + + __host__ __device__ static constexpr auto + MakeBBlockDescriptor_BK0_BN0_BN1_BK1(const BBlockDesc_BK0_BN_BK1& b_block_desc_bk0_bn_bk1) + { + const auto b_block_desc_bk0_bn0_bn1_bk1 = transform_tensor_descriptor( + b_block_desc_bk0_bn_bk1, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return b_block_desc_bk0_bn0_bn1_bk1; + } + + __host__ __device__ static constexpr auto + MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM_BN() + { + // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + // lower: [BM, BN] + constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n = + make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{})), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{})); + + return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n; + } + + __host__ __device__ static constexpr auto + 
MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1() + { + // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + // lower: [BM0, BM1, BN0, BN1] + constexpr auto c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1 = + make_single_stage_tensor_adaptor( + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{})); + + return c_block_adaptor_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1; + } + + __host__ __device__ static constexpr auto GetCThreadTensorLengths_BM0_BM1_BN0_BN1() + { + return Sequence{}; + } + + static constexpr auto a_block_desc_bk0_bm0_bm1_bk1_ = + MakeABlockDescriptor_BK0_BM0_BM1_BK1(ABlockDesc_BK0_BM_BK1{}); + + static constexpr auto b_block_desc_bk0_bn0_bn1_bk1_ = + MakeBBlockDescriptor_BK0_BN0_BN1_BK1(BBlockDesc_BK0_BN_BK1{}); + + public: + __device__ BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2() + : c_thread_origin_data_idx_{CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id())}, + a_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1], 0)}, + b_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3], 0)} + { + static_assert(ABlockDesc_BK0_BM_BK1::IsKnownAtCompileTime() && + BBlockDesc_BK0_BN_BK1::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(BlockSize == BM101 * BM100 * BN101 * BN100, + "wrong! blocksize and cluster size not consistent"); + + static_assert(BM % BM1 == 0 && BN % BN1 == 0, "wrong!"); + + static_assert(ABlockDesc_BK0_BM_BK1{}.GetLength(I0) == + BBlockDesc_BK0_BN_BK1{}.GetLength(I0), + "wrong! 
K dimension not consistent"); + + // TODO remove this restriction + static_assert(BM10BN10ThreadClusterBM10Xs::Size() == 2 && + BM10BN10ThreadClusterBN10Xs::Size() == 2, + "wrong!"); + + // TODO: remove this restriction + static_assert(BM0 == 2, "wrong"); + static_assert(BM0 == 2 && BN0 == 2, "wrong"); + } + + __device__ static CIndex CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1(index_t thread_id) + { + // lower: [BM0, BM1, BN0, BN1] + // upper: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + constexpr auto adaptor0 = + MakeCBlockAdaptor_BM0_BM100_BM101_BM11_BN0_BN100_BN101_BN11_To_BM0_BM1_BN0_BN1(); + + // lower: [BM0, BM100, BM101, BM11, BN0, BN100, BN101, BN11] + // upper: [Tid, BM0, BM11, BN0, BN11] + constexpr auto adaptor1 = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(BM100, BN100, BM101, BN101)), + make_pass_through_transform(BM0), + make_pass_through_transform(BM11), + make_pass_through_transform(BN0), + make_pass_through_transform(BN11)), + make_tuple( + Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1); + + return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0)); + } + + template + __device__ void Run(const CThreadDesc_BM0_BM11_BN0_BN11&, + const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + static_assert(CThreadDesc_BM0_BM11_BN0_BN11::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + // TODO: remove this restriction + static_assert(BM0 == 2 && BN0 == 2 && + CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I0) == BM0 && + CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0, + "wrong"); + + auto a_thread_buf = make_static_buffer( + a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize()); + + constexpr auto threadwise_contraction = + ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1< + FloatA, + FloatB, + FloatC, + decltype(a_thread_desc_bk0_bm0_bm1_bk1_), + decltype(b_thread_desc_bk0_bn0_bn1_bk1_), + CThreadDesc_BM0_BM11_BN0_BN11, + Sequence, + Sequence<1, BM1PerThreadBM11>, + Sequence<1, BN1PerThreadBN11>>{}; + + // read A_sub_0 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I0, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // read B_sub_0 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I0, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + // read B_sub_1 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I1, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I1, I0, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I1, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I1, I0, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + 
b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + + // loop over rest of bk0 + static_for{}([&](auto bk0) { + // read A_sub_0 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(bk0, I0, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // read B_sub_0 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(bk0, I0, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + + // read B_sub_1 + b_thread_copy_.Run(b_block_desc_bk0_bn0_bn1_bk1_, + make_tuple(bk0, I1, I0, I0), + b_block_buf, + b_thread_desc_bk0_bn0_bn1_bk1_, + make_tuple(I0, I1, I0, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_block_desc_bk0_bm0_bm1_bk1_, + make_tuple(bk0, I1, I0, I0), + a_block_buf, + a_thread_desc_bk0_bm0_bm1_bk1_, + make_tuple(I0, I1, I0, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + }); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_contraction.Run(a_thread_buf, + make_tuple(I0, I1, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + } + + private: + // A[BK0, BM0, BM1, BK1] + static constexpr auto a_thread_desc_bk0_bm0_bm1_bk1_ = + make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number{}, Number{}, Number{})); + + // B[BK0, BN0, BN1, BK1] + static constexpr auto b_thread_desc_bk0_bn0_bn1_bk1_ = + make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number{}, Number{}, Number{})); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4r1< + FloatA, + FloatA, + decltype(a_block_desc_bk0_bm0_bm1_bk1_), + decltype(a_thread_desc_bk0_bm0_bm1_bk1_), + Sequence, // SliceLengths + Sequence<0, 1, 2, 3>, // DimAccessOrder + Sequence<1, 1, BM1PerThreadBM11, BK1>, // SrcVectorTensorLengths + Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4r1< + FloatB, + FloatB, + decltype(b_block_desc_bk0_bn0_bn1_bk1_), + decltype(b_thread_desc_bk0_bn0_bn1_bk1_), + Sequence, // SliceLengths + Sequence<0, 1, 2, 3>, // DimAccessOrder + Sequence<1, 1, BN1PerThreadBN11, BK1>, // SrcVectorTensorLengths + Sequence<0, 1, 2, 3>>; // SrcVectorTensorContiguousDimOrder + + CIndex c_thread_origin_data_idx_; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp 
b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp new file mode 100644 index 00000000..33120bd8 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v2r2.hpp @@ -0,0 +1,397 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP +#define CK_BLOCKWISE_GEMM_DLOPS_V2R2_HPP + +#include "common_header.hpp" +#include "tensor_adaptor.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_contraction_dlops.hpp" + +namespace ck { + +// C[M0, M1, N0, N1] += transpose(A[K, M0, M1]) * B[K, N0, N1] +// A and B are visable to the whole block, C is distributed among each thread +// Assume: +// 1. A: +// 1. AKMBlockDesc is known at compile-time +// 2. ABlockBuffer is DynamicBuffer +// 2. B: +// 1. BKNBlockDesc is known at compile-time +// 2. BBlockBuffer is DynamicBuffer +// 3. C: +// 1. CM0M1N0N1ThreadDesc is known at compile-time +// 2. CThreadBuffer is StaticBuffer +// Also assume: +// M0 = N0 = 2. It will do 2x2 pipelined read and fma (ABBA optimization) +template < + index_t BlockSize, + typename FloatA, + typename FloatB, + typename FloatC, + typename AKMBlockDesc, + typename BKNBlockDesc, + index_t M1PerThreadM11, + index_t N1PerThreadN11, + index_t KPerThread, + index_t M1N1ThreadClusterM100, + index_t M1N1ThreadClusterN100, + index_t M1N1ThreadClusterM101, + index_t M1N1ThreadClusterN101, + index_t AThreadCopyScalarPerVector_M11, + index_t BThreadCopyScalarPerVector_N11, + typename enable_if::type = false> +struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2 +{ + using AIndex = MultiIndex<3>; + using BIndex = MultiIndex<3>; + using CIndex = MultiIndex<4>; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr index_t K = AKMBlockDesc{}.GetLength(I0); + static constexpr index_t M = AKMBlockDesc{}.GetLength(I1); + static constexpr index_t N = BKNBlockDesc{}.GetLength(I1); + + static constexpr index_t M100 = M1N1ThreadClusterM100; + static constexpr index_t N100 = M1N1ThreadClusterN100; + + static constexpr index_t M101 = M1N1ThreadClusterM101; + static constexpr index_t N101 = M1N1ThreadClusterN101; + + static constexpr index_t M11 = M1PerThreadM11; + static constexpr index_t N11 = N1PerThreadN11; + + static constexpr index_t M1 = M1N1ThreadClusterM100 * M1N1ThreadClusterM101 * M1PerThreadM11; + static constexpr index_t N1 = M1N1ThreadClusterN100 * M1N1ThreadClusterN101 * N1PerThreadN11; + + static constexpr index_t M0 = M / M1; + static constexpr index_t N0 = N / N1; + + __host__ __device__ static constexpr auto + MakeAKM0M1BlockDescriptor(const AKMBlockDesc& /* a_k_m_block_desc */) + { + const auto a_k_m0_m1_block_desc = transform_tensor_descriptor( + AKMBlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return a_k_m0_m1_block_desc; + } + + __host__ __device__ static constexpr auto + MakeBKN0N1BlockDescriptor(const BKNBlockDesc& /* b_k_n_block_desc */) + { + const auto b_k_n0_n1_block_desc = transform_tensor_descriptor( + BKNBlockDesc{}, + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, 
Sequence<1, 2>{})); + + return b_k_n0_n1_block_desc; + } + + __host__ __device__ static constexpr auto MakeCM0M100M101M11N0N100N101N11ToMNBlockAdaptor() + { + // upper: [M0, M100, M101, M11, N0, N100, N101, N11] + // lower: [M, N] + constexpr auto c_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n_block_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{})), + make_unmerge_transform(make_tuple( + Number{}, Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4, 5, 6, 7>{})); + + return c_m0_m100_m101_m11_n0_n100_n101_n11_to_m_n_block_adaptor; + } + + __host__ __device__ static constexpr auto + MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor() + { + // upper: [M0, M100, M101, M11, N0, N100, N101, N11] + // lower: [M0, M1, N0, N1] + constexpr auto c_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1_block_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{})), + make_pass_through_transform(Number{}), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2, 3>{}, Sequence<4>{}, Sequence<5, 6, 7>{})); + + return c_m0_m100_m101_m11_n0_n100_n101_n11_to_m0_m1_n0_n1_block_adaptor; + } + + __host__ __device__ static constexpr auto GetCM0M1N0N1ThreadTensorLengths() + { + return Sequence{}; + } + + static constexpr auto a_k_m0_m1_block_desc_ = MakeAKM0M1BlockDescriptor(AKMBlockDesc{}); + static constexpr auto b_k_n0_n1_block_desc_ = MakeBKN0N1BlockDescriptor(BKNBlockDesc{}); + + public: + __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2() + : c_thread_origin_data_idx_{CalculateCM0M1N0N1ThreadOriginOnBlock( + get_thread_local_1d_id())}, + a_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I0], c_thread_origin_data_idx_[I1])}, + b_thread_copy_{ + make_tuple(0, c_thread_origin_data_idx_[I2], c_thread_origin_data_idx_[I3])} + { + static_assert(AKMBlockDesc::IsKnownAtCompileTime() && BKNBlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(BlockSize == M101 * M100 * N101 * N100, + "wrong! blocksize and cluster size not consistent"); + + static_assert(M % M1 == 0 && N % N1 == 0, "wrong!"); + + static_assert(AKMBlockDesc{}.GetLength(I0) == BKNBlockDesc{}.GetLength(I0), + "wrong! 
K dimension not consistent"); + + // TODO: remove this restriction + static_assert(M0 == 2 && N0 == 2, "wrong"); + } + + __device__ static CIndex CalculateCM0M1N0N1ThreadOriginOnBlock(index_t thread_id) + { + // lower: [M0, M1, N0, N1] + // upper: [M0, M100, M101, M11, N0, N100, N101, N11] + constexpr auto adaptor0 = MakeCM0M100M101M11N0N100N101N11ToM0M1N0N1BlockAdaptor(); + + // lower: [M0, M100, M101, M11, N0, N100, N101, N11] + // upper: [Tid, M0, M11, N0, N11] + constexpr auto adaptor1 = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M100, N100, M101, N101)), + make_pass_through_transform(M0), + make_pass_through_transform(M11), + make_pass_through_transform(N0), + make_pass_through_transform(N11)), + make_tuple( + Sequence<1, 5, 2, 6>{}, Sequence<0>{}, Sequence<3>{}, Sequence<4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + constexpr auto adaptor = chain_tensor_adaptors(adaptor0, adaptor1); + + return adaptor.CalculateBottomIndex(make_multi_index(thread_id, 0, 0, 0, 0)); + } + + __host__ __device__ static constexpr index_t GetABlockAlignment() { return M1PerThreadM11; } + + __host__ __device__ static constexpr auto GetBBlockAlignment() { return N1PerThreadN11; } + + template + __device__ void Run(const CM0M1N0N1ThreadDesc& /* c_m0_m1_n0_n1_thread_desc */, + const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + static_assert(CM0M1N0N1ThreadDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + // TODO: remove this restriction + static_assert(M0 == 2 && N0 == 2 && CM0M1N0N1ThreadDesc{}.GetLength(I0) == M0 && + CM0M1N0N1ThreadDesc{}.GetLength(I2) == N0, + "wrong"); + + auto a_thread_buf = make_static_buffer( + a_k_m0_m1_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_k_n0_n1_thread_desc_.GetElementSpaceSize()); + + constexpr auto threadwise_gemm = + ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1, + Sequence<1, M1PerThreadM11>, + Sequence<1, N1PerThreadN11>>{}; + + // read A_sub_0 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(I0, I0, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I0, I0), + a_thread_buf); + + // read B_sub_0 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(I0, I0, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I0, I0), + b_thread_buf); + + // read B_sub_1 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(I0, I1, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I1, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(I0, I1, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I1, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + + // loop over rest of k + static_for{}([&](auto k) { + // read A_sub_0 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(k, I0, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I0, I0), + a_thread_buf); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + 
b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // read B_sub_0 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(k, I0, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I0, I0), + b_thread_buf); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + + // read B_sub_1 + b_thread_copy_.Run(b_k_n0_n1_block_desc_, + make_tuple(k, I1, I0), + b_block_buf, + b_k_n0_n1_thread_desc_, + make_tuple(I0, I1, I0), + b_thread_buf); + + // read A_sub_1 + a_thread_copy_.Run(a_k_m0_m1_block_desc_, + make_tuple(k, I1, I0), + a_block_buf, + a_k_m0_m1_thread_desc_, + make_tuple(I0, I1, I0), + a_thread_buf); + + // C_sub_00 += transpose(A_sub_0) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I0, I0, I0, I0)); + + // C_sub_01 += transpose(A_sub_0) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I0, I0, I1, I0)); + }); + + // C_sub_10 += transpose(A_sub_1) * B_sub_0 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + make_tuple(I0, I0, I0), + c_thread_buf, + make_tuple(I1, I0, I0, I0)); + + // C_sub_11 += transpose(A_sub_1) * B_sub_1 + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I1, I0), + b_thread_buf, + make_tuple(I0, I1, I0), + c_thread_buf, + make_tuple(I1, I0, I1, I0)); + } + + private: + // A[K, M0, M1] + static constexpr auto a_k_m0_m1_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, Number{})); + + // B[K, N0, N1] + static constexpr auto b_k_n0_n1_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, Number{})); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + AThreadCopyScalarPerVector_M11, + 1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + BThreadCopyScalarPerVector_N11, + 1>; + + CIndex c_thread_origin_data_idx_; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp new file mode 100644 index 00000000..f4565572 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_dlops_v3.hpp @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
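// Illustration only (not part of the patch): the two pipelined blockwise GEMMs above both
// compute C[M0, M1, N0, N1] += transpose(A[K, M0, M1]) * B[K, N0, N1] with M0 = N0 = 2,
// interleaving the four sub-tile FMAs with the reads of the next K slice (the ABBA schedule)
// so reads and math overlap. A plain host-side reference of the math only, with the
// scheduling removed and hypothetical array-of-array types, might look like this:
#include <array>
#include <cstddef>

template <typename T, std::size_t K, std::size_t M1, std::size_t N1>
void gemm_2x2_register_block(
    const std::array<std::array<std::array<T, M1>, 2>, K>& a,           // A[k][m0][m1]
    const std::array<std::array<std::array<T, N1>, 2>, K>& b,           // B[k][n0][n1]
    std::array<std::array<std::array<std::array<T, N1>, 2>, M1>, 2>& c) // C[m0][m1][n0][n1]
{
    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t m0 = 0; m0 < 2; ++m0)      // the two A sub-tiles
            for(std::size_t n0 = 0; n0 < 2; ++n0)  // the two B sub-tiles
                for(std::size_t m1 = 0; m1 < M1; ++m1)
                    for(std::size_t n1 = 0; n1 < N1; ++n1)
                        c[m0][m1][n0][n1] += a[k][m0][m1] * b[k][n0][n1];
}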
+ +#ifndef CK_BLOCKWISE_GEMM_DLOPS_V3_HPP +#define CK_BLOCKWISE_GEMM_DLOPS_V3_HPP + +#include "common_header.hpp" +#include "threadwise_gemm_dlops_v3.hpp" + +namespace ck { + +template +struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + + using AIndex = MultiIndex<3>; + using BIndex = MultiIndex<3>; + using CIndex = MultiIndex<4>; + + static constexpr auto E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0); + static constexpr auto KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1); + static constexpr auto E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2); + + static constexpr auto HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2); + static constexpr auto WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3); + + static constexpr auto KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0); + static constexpr auto HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2); + static constexpr auto WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3); + + static constexpr auto a_thread_mtx_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, Number{})); + + static constexpr auto b_thread_mtx_ = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number<1>{}, + Number{}, + Number{}, + Number{})); + + static constexpr auto c_thread_mtx_ = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + __device__ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3() + : c_thread_origin_data_idx_{GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id())}, + a_thread_copy_{make_tuple(0, c_thread_origin_data_idx_[I0] * KPerThread, 0)} + { + static_assert(ABlockDesc_E1_K1_E2::IsKnownAtCompileTime() && + BBlockDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() && + CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert( + ABlockDesc_E1_K1_E2{}.GetLength(I0) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I0) && + ABlockDesc_E1_K1_E2{}.GetLength(I2) == BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I4), + "wrong! E dimension not consistent\n"); + + static_assert(E1 % EPerThreadLoop == 0, ""); + static_assert(KPerThread % KPerThreadLoop == 0, ""); + + static_assert(KPerBlock % KPerThread == 0 && HoPerBlock % HoPerThread == 0 && + WoPerBlock % WoPerThread == 0, + "wrong! Cannot evenly divide work among\n"); + + constexpr auto KThreadCluster = KPerBlock / KPerThread; + constexpr auto HThreadCluster = HoPerBlock / HoPerThread; + constexpr auto WThreadCluster = WoPerBlock / WoPerThread; + + static_assert(BlockSize == KThreadCluster * HThreadCluster * WThreadCluster, + "wrong! 
wrong blocksize\n"); + } + + __device__ static constexpr auto GetCThreadDesc_K_N_Ho_WoLengths() + { + return Sequence{}; + } + + __device__ static CIndex GetBeginOfCThreadDesc_K_N_Ho_Wo(index_t thread_id) + { + constexpr auto K0 = KPerBlock / KPerThread; + constexpr auto N0 = I1; + constexpr auto H0 = HoPerBlock / HoPerThread; + constexpr auto W0 = WoPerBlock / WoPerThread; + + constexpr auto c_threadid_to_k_n_h_w_thread_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto c_k_n_h_w_thread_cluster_idx = + c_threadid_to_k_n_h_w_thread_cluster_adaptor.CalculateBottomIndex( + make_multi_index(thread_id)); + + return c_k_n_h_w_thread_cluster_idx; + } + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BThreadBuffer& b_thread_buf, + CThreadBuffer& c_thread_buf) const + { + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); + + constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{}; + + // thread A buffer for GEMM + StaticBuffer + a_thread_buf; + + constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3{}; + + static_for<0, E1, EPerThreadLoop>{}([&](auto e_begin) { + static_for<0, KPerThread, KPerThreadLoop>{}([&](auto k_begin) { + a_thread_copy_.Run(a_block_mtx, + make_tuple(e_begin, k_begin, I0), + a_block_buf, + a_thread_mtx_, + make_tuple(I0, I0, I0), + a_thread_buf); + + threadwise_gemm.Run(a_thread_buf, + make_tuple(I0, I0, I0), + b_thread_buf, + make_tuple(e_begin, I0, I0, I0, I0), + c_thread_buf, + make_tuple(k_begin, I0, I0, I0)); + }); + }); + } + + template + __device__ void MoveABlockSliceWindow(const ABlockSliceMoveStepIdx& a_block_slice_move_step_idx) + { + a_thread_copy_.MoveSrcSliceWindow(ABlockDesc_E1_K1_E2{}, a_block_slice_move_step_idx); + } + + private: + using AThreadCopy = + ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2>, + 2, + E2, + E2>; + + CIndex c_thread_origin_data_idx_; + + AThreadCopy a_thread_copy_; +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp new file mode 100644 index 00000000..aeef03d5 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp @@ -0,0 +1,998 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
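// Illustration only (not part of the patch): GetBeginOfCThreadDesc_K_N_Ho_Wo in the v3
// blockwise GEMM above maps a linear thread id onto a (k, n, h, w) thread-cluster
// coordinate through a merge-transform adaptor. Assuming the usual row-major merge order
// (w fastest), the equivalent div/mod arithmetic is:
#include <array>
#include <cstdint>

std::array<std::int32_t, 4> thread_cluster_index(
    std::int32_t thread_id, std::int32_t K0, std::int32_t N0, std::int32_t H0, std::int32_t W0)
{
    const std::int32_t w = thread_id % W0;
    const std::int32_t h = (thread_id / W0) % H0;
    const std::int32_t n = (thread_id / (W0 * H0)) % N0;
    const std::int32_t k = thread_id / (W0 * H0 * N0);
    return {k, n, h, w};
}
// Example with the hypothetical cluster shape [K0, N0, H0, W0] = [4, 1, 4, 8] (128 threads):
// thread 37 maps to (1, 0, 0, 5), since 37 % 8 = 5, (37 / 8) % 4 = 0 and 37 / 32 = 1.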
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +enum struct LoopScheduler +{ + Default, + Interwave, +}; + +constexpr LoopScheduler make_default_loop_scheduler() +{ +#if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING + return LoopScheduler::Interwave; +#else + return LoopScheduler::Default; +#endif // if CK_EXPERIMENTAL_DEFAULT_TO_INTER_WAVE_SCHEDULING +} + +template +__host__ __device__ static constexpr auto +MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(const TileDesc_K0_MN_K1&) +{ + constexpr index_t K0 = TileDesc_K0_MN_K1{}.GetLength(Number<0>{}); + constexpr index_t K1 = TileDesc_K0_MN_K1{}.GetLength(Number<2>{}); + + return transform_tensor_descriptor( + TileDesc_K0_MN_K1{}, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(Number{}, Number{})), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{})); +} + +template +struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + using ThisThreadBlock = ThisThreadBlock; + + static constexpr index_t WaveSize = get_warp_size(); + + static constexpr index_t MPerBlock = AK0MK1BlockDesc{}.GetLength(I1); + static constexpr index_t NPerBlock = BK0NK1BlockDesc{}.GetLength(I1); + static constexpr index_t KPerBlock = + BK0NK1BlockDesc{}.GetLength(I0) * BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); + static constexpr index_t B_K0 = BK0NK1BlockDesc{}.GetLength(I0); + static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); + static constexpr index_t B_K1 = BK0NK1BlockDesc{}.GetLength(I2); + + static constexpr auto xdlops_gemm = XdlopsGemm{}; + + static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex(); + + return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex(); + + return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]); + } + + template + __device__ static auto + 
CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); + + constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperxdl_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex8D(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i); + + return make_tuple(Number{}, + Number{}, + waveId_m, + waveId_n, + blk_idx[I0], + blk_idx[I1], + blk_idx[I2], + blk_idx[I3]); + } + + __host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1() + { + static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && + BK0NK1BlockDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0, + "wrong!"); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + 
Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_block_desc_g_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N& c_grid_desc_g_m_n) + { + const auto G = c_grid_desc_g_m_n.GetLength(I0); + const auto M = c_grid_desc_g_m_n.GetLength(I1); + const auto N = c_grid_desc_g_m_n.GetLength(I2); + + const auto c_grid_desc_g_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_g_m_n, + make_tuple(make_pass_through_transform(G), + make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3, 5>{}, Sequence<2, 4, 6>{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_grid_desc_g_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto MakeABlockDescriptor_M0_M1_M2_K() + { + return transform_tensor_descriptor( + AK0MK1BlockDesc{}, + make_tuple( + make_merge_transform_v3_division_mod(make_tuple(Number{}, Number{})), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{})); + } + + __host__ __device__ static constexpr auto MakeBBlockDescriptor_N0_N1_N2_K() + { + return transform_tensor_descriptor( + BK0NK1BlockDesc{}, + make_tuple( + make_merge_transform_v3_division_mod(make_tuple(Number{}, Number{})), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{})); + } + + static constexpr auto a_block_desc_m0_m1_m2_k = MakeABlockDescriptor_M0_M1_M2_K(); + static constexpr auto b_block_desc_n0_n1_n2_k = MakeBBlockDescriptor_N0_N1_N2_K(); + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, I0), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + + static_for<0, KPerThread, KPack>{}([&](auto k) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + 
static_for<0, KPack, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = a_thread_buf + [Number{}]; + b_thread_vec.template AsType()(i) = b_thread_buf + [Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + + protected: + // A[M0, M1, M2, KPerThread] + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // B[N0, N1, N2, KPerThread] + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // C[M, N, NumRegXdlops] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, xdlops_gemm.GetRegSizePerXdlops())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; +}; + +// Note: To facilitate the inter-wave loop scheduler, we need to explicitly set the macro +// CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=1 as a few intrinsics are not yet available in +// the latest ROCm release. For unsupported compilers, inter-wave loop scheduler falls back to the +// default loop scheduler which is given by the macro CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING=0 +template +struct BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 + : public BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1 +{ + using Base = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; + +#if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING + using Base::a_block_desc_m0_m1_m2_k; + using Base::A_K1; + using Base::b_block_desc_n0_n1_n2_k; + using Base::B_K1; + using Base::c_thread_buf_; + using Base::c_thread_desc_; + using Base::CalculateAThreadOriginDataIndex; + using Base::CalculateBThreadOriginDataIndex; + using Base::I0; + using Base::I1; + using Base::KPerThread; + using Base::xdlops_gemm; + + static constexpr index_t KPerInnerLoop = math::max(KPerThread / NumMacClusters, KPack); + + // 2-wave optimized blockwise gemm + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + static_for<0, KPerThread, KPerInnerLoop>{}([&](auto k) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, k), + a_block_buf, + a_thread_desc_, + make_tuple(m0, I0, I0, I0), + a_thread_buf); + }); + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, k), + b_block_buf, + b_thread_desc_, + make_tuple(n0, I0, I0, I0), + b_thread_buf); + }); + __builtin_amdgcn_sched_barrier(0); + // NOTE: Synchronize threads in a workgroup at the start of each MAC cluster, but except + // the first, as we can shorten non-MAC cluster a bit and there's no observable negative + // impact. The desired effect is waves in a workgroup executing MAC in sync. 
This avoids + // some out-of-sync waves hijacking MAC resource from other workgroups and reducing the + // chance of latency hiding by waiting for the rest of the workgroup at the eventual + // sync point. + if constexpr(k.value != 0 || KPerInnerLoop == KPerThread) + { + asm volatile("s_barrier" ::); + __builtin_amdgcn_sched_barrier(0); + } + static_for<0, KPerInnerLoop, KPack>{}([&](auto k_) { + static_for<0, MRepeat, 1>{}([&](auto m0) { + static_for<0, NRepeat, 1>{}([&](auto n0) { + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = + a_thread_buf[Number{}]; + b_thread_vec.template AsType()(i) = + b_thread_buf[Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + // The block_sync_lds() here performs double duty: + // A) safeguard against data hazard because barrier from blockwise_gemm is + // moved here B) reduce VMEM FIFO congestion by applying small delays to + // different wavefronts It is performed near the end of MAC cluster to + // minimize lgkmcnt penalty + if constexpr(k.value == KPerThread - KPerInnerLoop && + k_.value == KPerInnerLoop - KPack && m0.value == MRepeat - 1 && + n0.value == NRepeat - 1) + { + __builtin_amdgcn_sched_barrier(0); + block_sync_lds(); + __builtin_amdgcn_sched_barrier(0); + } + + // TODO: insert setprio in more precise manner since we + // could have more than >1 MFMA instructions in single call + xdlops_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + if constexpr(k_.value == 0 && m0.value == 0 && n0.value == 0) + { + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(1); + __builtin_amdgcn_sched_barrier(0); + } + }); + }); + }); + __builtin_amdgcn_sched_barrier(0); + __builtin_amdgcn_s_setprio(0); + __builtin_amdgcn_sched_barrier(0); + }); + } + + protected: + // A[M0, M1, M2, KPerInnerLoop] + static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, Number{})); + + // B[N0, N1, N2, KPerInnerLoop] + static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, I1, Number{})); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; + BThreadCopy b_thread_copy_{CalculateBThreadOriginDataIndex()}; + +#endif // #if CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING +}; + +template +constexpr auto BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + } + else if constexpr(LoopSched == LoopScheduler::Interwave) + { + return BlockwiseGemmXdlopsInterwave_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + } +}; + +// Blockwise gemm supporting +// 1. regular XDL output M2_M3_M4_M2 and transposed XDL output M2_N2_N3_N4 +// 2. decoupled input tile descriptor and mma tile descriptor in order to support both vgpr and LDS +// source buffer +// 3. 
configurable k index starting position and step size after each FMA/XDL instruction +template {}.K0PerXdlops, + index_t BMmaKStride = + KPack* XdlopsGemm{}.K0PerXdlops> +struct BlockwiseGemmXdlops_v2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + using ThisThreadBlock = ThisThreadBlock; + + static constexpr index_t WaveSize = get_warp_size(); + + static constexpr index_t A_K0 = ATileDesc{}.GetLength(I0); + static constexpr index_t B_K0 = BTileDesc{}.GetLength(I0); + static constexpr index_t A_K1 = ATileDesc{}.GetLength(I2); + static constexpr index_t B_K1 = BTileDesc{}.GetLength(I2); + + static constexpr auto xdlops_gemm = XdlopsGemm{}; + + static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); + + static_assert(KPerThread % KPack == 0, + "Wrong KPack setting; try increasing KPerThread or decreasing KPack"); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = ThisThreadBlock::GetThreadId(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex(); + + return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPack * xdlops_a_idx[I0]); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex(); + + return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPack * xdlops_b_idx[I0]); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); + + constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperxdl_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex8D(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto 
waveId_n = wave_idx[I1]; + + const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i); + + return make_tuple( + m0, n0, waveId_m, waveId_n, blk_idx[I0], blk_idx[I1], blk_idx[I2], blk_idx[I3]); + } + + using Tuple4 = decltype(CalculateAThreadOriginDataIndex()); + + __host__ __device__ BlockwiseGemmXdlops_v2(Tuple4 a_origin = CalculateAThreadOriginDataIndex(), + Tuple4 b_origin = CalculateBThreadOriginDataIndex()) + : a_thread_copy_(a_origin), b_thread_copy_(b_origin) + { + static_assert(AMmaTileDesc::IsKnownAtCompileTime() && BMmaTileDesc::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(ThisThreadBlock::GetNumOfThread() == MWaves * NWaves * WaveSize, + "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0, + "wrong!"); + } + + __host__ __device__ BlockwiseGemmXdlops_v2(const BlockwiseGemmXdlops_v2& other) + : a_thread_copy_(other.a_origin), b_thread_copy_(other.b_origin) + { + } + + // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl' + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, N, M0, M1, M2)); + } + + // XDL output supporting C_xdl = A_xdl * B_xdl + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + // transposed XDL output supporting C_xdl' = B_xdl' * A_xdl' + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + // XDL output supporting C_xdl = A_xdl * B_xdl + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto 
GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_block_desc_g_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N& c_grid_desc_g_m_n) + { + const auto G = c_grid_desc_g_m_n.GetLength(I0); + const auto M = c_grid_desc_g_m_n.GetLength(I1); + const auto N = c_grid_desc_g_m_n.GetLength(I2); + + const auto c_grid_desc_g_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_g_m_n, + make_tuple(make_pass_through_transform(G), + make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3, 5>{}, Sequence<2, 4, 6>{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_grid_desc_g_m0_n0_m1_n1_m2_n2); + } + + static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k; + static constexpr BMmaTileDesc b_block_desc_n0_n1_n2_k; + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_block_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + auto b_thread_buf = make_static_buffer( + b_thread_desc_.GetElementSpaceSize()); + + static_for<0, KPerThread / KPack, 1>{}([&](auto k) { // k=0,1,2 instead of k=0,kpack*1, ... 
+ static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, Number{}), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + b_thread_copy_.Run(b_block_desc_n0_n1_n2_k, + make_tuple(n0, I0, I0, Number{}), + b_block_buf, + b_thread_desc_, + make_tuple(I0, I0, I0, I0), + b_thread_buf); + vector_type a_thread_vec; + vector_type b_thread_vec; + + static_for<0, KPack, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = a_thread_buf + [Number{}]; + b_thread_vec.template AsType()(i) = b_thread_buf + [Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + + protected: + // A[M0, M1, M2, KPack] + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // B[N0, N1, N2, KPack] + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // C[M, N, NumRegXdlops] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, xdlops_gemm.GetRegSizePerXdlops())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + using BThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + B_K1, + B_K1>; + + AThreadCopy a_thread_copy_; + BThreadCopy b_thread_copy_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp new file mode 100644 index 00000000..aa814ab0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
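// Illustration only (not part of the patch): the XDL blockwise GEMMs above share the same
// accumulation structure; for every (m0, n0) repeat and every KPack-wide slice of the
// per-thread K range, KPack A and B values are packed into short vectors and handed to one
// MFMA-style multiply-accumulate. A structural sketch with a generic callback standing in
// for the xdlops_gemm call (names and fragment layout here are assumptions):
template <int MRepeat, int NRepeat, int KPerThread, int KPack, typename T, typename MfmaFn>
void run_xdl_repeats(const T* a_frag, // [MRepeat][KPerThread] per-thread A fragment
                     const T* b_frag, // [NRepeat][KPerThread] per-thread B fragment
                     T* c_acc,        // [MRepeat][NRepeat][regs_per_mfma] accumulators
                     int regs_per_mfma,
                     MfmaFn&& mfma)   // mfma(a_vec, b_vec, c_tile) accumulates one XDL tile
{
    for(int m0 = 0; m0 < MRepeat; ++m0)
        for(int n0 = 0; n0 < NRepeat; ++n0)
            for(int k = 0; k < KPerThread; k += KPack)
            {
                const T* a_vec = a_frag + m0 * KPerThread + k; // KPack contiguous A values
                const T* b_vec = b_frag + n0 * KPerThread + k; // KPack contiguous B values
                mfma(a_vec, b_vec, c_acc + (m0 * NRepeat + n0) * regs_per_mfma);
            }
}
// The interwave variant reorders this nest (K chunk outermost, with barriers and priority
// hints between chunks) so that waves in a workgroup issue their MFMA bursts in sync.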
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" + +namespace ck { + +template +struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr index_t WaveSize = 64; + + static constexpr index_t KPerBlock = K0PerBlock * KPack; + + static constexpr index_t A_K0 = AK0MK1BlockDesc{}.GetLength(I0); + static constexpr index_t A_K1 = AK0MK1BlockDesc{}.GetLength(I2); + + static constexpr auto xdlops_gemm = XdlopsGemm{}; + + static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops; + static constexpr index_t K0PerThread = K0PerBlock / xdlops_gemm.K0PerXdlops; + + static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL); + static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL); + + StaticBufferTupleOfVector + c_thread_buf_; + + __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = get_thread_local_1d_id(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + + const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex(); + + return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]); + } + + __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_n = wave_idx[I1]; + + const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex(); + + return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]); + } + + template + __device__ static auto + CalculateCThreadOriginDataIndex(Number, Number, Number, Number) + { + const auto wave_idx = GetWaveIdx(); + + const auto waveId_m = wave_idx[I0]; + const auto waveId_n = wave_idx[I1]; + + const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i); + + constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + constexpr auto nrepeat_nwave_nperxdl_to_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(NRepeat, NWaves, NPerXDL))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1, 2>{})); + + const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex( + make_tuple(m0, waveId_m, blk_idx[I0]))[I0]; + const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex( + make_tuple(n0, waveId_n, blk_idx[I1]))[I0]; + + return make_tuple(c_thread_m, c_thread_n); + } + + __host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1() + { + static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() && + BK0K0BN0N1N2N3K1BlockDesc::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + static_assert(BlockSize == MWaves * NWaves * WaveSize, + "BlockSize != MWaves * NWaves * WaveSize\n"); + + static_assert(MPerBlock % (MPerXDL * MRepeat) == 0 && NPerBlock % (NPerXDL * NRepeat) == 0, + "wrong!"); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_m0_m1_m2_n_tblk_lens = xdlops_gemm.GetCM0M1M2NThreadBlkLengths(); + + constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[I0]; + constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[I1]; + constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[I2]; + constexpr auto N = c_m0_m1_m2_n_tblk_lens[I3]; + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, Number{}, I1, I1, M0, M1, M2, N)); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2); + } + + __host__ __device__ static constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2() + { + constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + Number{}, + Number{}, + Number{}, + Number{}, + Number{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_block_desc_g_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto c_grid_desc_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{})); + + return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N& c_grid_desc_g_m_n) + { + const auto G = c_grid_desc_g_m_n.GetLength(I0); + const auto M = c_grid_desc_g_m_n.GetLength(I1); + const auto N = c_grid_desc_g_m_n.GetLength(I2); + + const auto c_grid_desc_g_m0_n0_m1_n1_m2_n2 = transform_tensor_descriptor( + c_grid_desc_g_m_n, + make_tuple(make_pass_through_transform(G), + make_unmerge_transform(make_tuple(M / (MWaves * MPerXDL), MWaves, MPerXDL)), + make_unmerge_transform(make_tuple(N / (NWaves * NPerXDL), NWaves, NPerXDL))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3, 5>{}, Sequence<2, 4, 6>{})); + + return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + c_grid_desc_g_m0_n0_m1_n1_m2_n2); + } + + 
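    // Worked example (hypothetical sizes, illustration only) of the unmerge used by the
    // C grid descriptors above: with M = 256, MWaves = 2 and MPerXDL = 32, the M axis is
    // split into (M / (MWaves * MPerXDL), MWaves, MPerXDL) = (4, 2, 32), so a row index m
    // decomposes as m = (m0 * 2 + m1) * 32 + m2, i.e. m0 selects an XDL-tile repeat along
    // M, m1 the wave within the block, and m2 the row inside one XDL tile. The N axis is
    // split the same way, and xdlops_gemm then refines the per-tile (m2, n2) pair into the
    // final M2_M3_M4/N2 register layout.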
__host__ __device__ static constexpr auto MakeABlockDescriptor_M0_M1_M2_K() + { + return transform_tensor_descriptor( + AK0MK1BlockDesc{}, + make_tuple( + make_merge_transform_v3_division_mod(make_tuple(Number{}, Number{})), + make_unmerge_transform( + make_tuple(Number{}, Number{}, Number{}))), + make_tuple(Sequence<0, 2>{}, Sequence<1>{}), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{})); + } + + __device__ void MoveABlockSliceWindow() + { + a_thread_copy_.MoveSrcSliceWindow(a_block_desc_m0_m1_m2_k, + make_multi_index(0, 0, 0, K0PerBlock * KPack)); + } + __device__ void ResetABlockStartWindow() + { + a_thread_copy_.SetSrcCoord(CalculateAThreadOriginDataIndex()); + } + + static constexpr auto a_block_desc_m0_m1_m2_k = MakeABlockDescriptor_M0_M1_M2_K(); + + template + __device__ void Run(const ABlockBuffer& a_block_buf, + const BBlockBuffer& b_thread_buf, + CThreadBuffer& c_thread_buf) const + { + auto a_thread_buf = make_static_buffer( + a_thread_desc_.GetElementSpaceSize()); + + static_for<0, MRepeat, 1>{}([&](auto m0) { + // read A + a_thread_copy_.Run(a_block_desc_m0_m1_m2_k, + make_tuple(m0, I0, I0, I0), + a_block_buf, + a_thread_desc_, + make_tuple(I0, I0, I0, I0), + a_thread_buf); + + static_for<0, NRepeat, 1>{}([&](auto n0) { + // read B + static_for<0, KPerThread, KPack>{}([&](auto k) { + vector_type a_thread_vec; + vector_type b_thread_vec; + constexpr index_t k0 = k / KPack; + static_for<0, KPack, 1>{}([&](auto i) { + a_thread_vec.template AsType()(i) = a_thread_buf + [Number{}]; + b_thread_vec.template AsType()(i) = b_thread_buf + [Number{}]; + }); + + using mfma_input_type = + typename vector_type::type; + + constexpr index_t c_offset = + c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0)); + + xdlops_gemm.template Run( + a_thread_vec.template AsType(), + b_thread_vec.template AsType(), + c_thread_buf.GetVectorTypeReference(Number{})); + }); + }); + }); + } + + private: + // A[M0, M1, M2, KPerThread] + static constexpr auto a_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number{})); + + // B[N0, N1, N2, KPerThread] + static constexpr auto b_thread_desc_ = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, // KPerThread + Number{}, // repeat + Number{})); + + // C[M, N, NumRegXdlops] + static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, xdlops_gemm.GetRegSizePerXdlops())); + + using AThreadCopy = ThreadwiseTensorSliceTransfer_v4, + Sequence<0, 1, 2, 3>, + 3, + A_K1, + A_K1>; + + AThreadCopy a_thread_copy_{CalculateAThreadOriginDataIndex()}; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp new file mode 100644 index 00000000..d7ec1773 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
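+//
+// BlockwiseSoftmax below computes the numerically stable softmax
+//     P(i) = exp(s(i) - max_j s(j)) / sum_k exp(s(k) - max_j s(j))
+// in three passes over each thread's slice: a block-wide max reduction, an
+// in-place rewrite of every element to exp(s - max), and a block-wide sum
+// reduction. Run() leaves exp(s - max) in the thread buffer and only returns
+// the per-row max and sum; the final division by the row sum is left to the
+// caller.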
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" + +namespace ck { + +template +struct BlockwiseSoftmax +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0); + static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1); + + using ThreadSliceDesc_M = decltype( + make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0)))); + + using ThreadwiseMaxReduce = typename conditional< + IgnoreNaN, + ThreadwiseReduction>, + ThreadwiseReduction>::type; + + using ThreadwiseSumReduce = typename conditional< + IgnoreNaN, + ThreadwiseReduction>, + ThreadwiseReduction>::type; + + using ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths()); + + using BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2; + + using BlockwiseSumReduce = PartitionedBlockwiseReduction_v2; + + using BufferType = StaticBuffer; + + template + __host__ __device__ void Run(CThreadBuffer& in_thread_buf, WorkspaceBuffer& reduce_work_buf) + { + // find max value + static_for<0, MRepeat, 1>{}([&](auto I) { + max_value_buf(I) = reduce::Max::template GetIdentityValue(); + }); + ThreadwiseMaxReduce::Reduce(in_thread_buf, max_value_buf); + static_for<0, MRepeat, 1>{}([&](auto I) { + BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I)); + block_sync_lds(); + }); + + // calculate exp for elements, P=exp(s-max) + static_for<0, MRepeat, 1>{}([&](auto iM) { + static_for<0, KRepeat, 1>{}([&](auto iK) { + auto offset = Number{}; + in_thread_buf(offset) = IgnoreNaN && ck::math::isnan(in_thread_buf[offset]) + ? 0 + : math::exp(in_thread_buf[offset] - max_value_buf(iM)); + }); + }); + + // sum data + static_for<0, MRepeat, 1>{}([&](auto I) { + sum_value_buf(I) = reduce::Add::template GetIdentityValue(); + }); + ThreadwiseSumReduce::Reduce(in_thread_buf, sum_value_buf); + static_for<0, MRepeat, 1>{}([&](auto I) { + BlockwiseSumReduce::Reduce(reduce_work_buf, sum_value_buf(I)); + block_sync_lds(); + }); + } + + BufferType max_value_buf; + BufferType sum_value_buf; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp new file mode 100644 index 00000000..03e4d42d --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. 
ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +template +struct BlockwiseTensorSliceTransfer_v5r1 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + using Index = MultiIndex; + + __device__ constexpr BlockwiseTensorSliceTransfer_v5r1(const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin) + : threadwise_transfer_( + src_desc, make_zero_multi_index(), dst_desc, make_zero_multi_index()) + + { + static_assert(nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == BlockSliceLengths::Size() && nDim == ThreadSliceLengths::Size() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(BlockSize >= thread_cluster_desc_.GetElementSize(), + "wrong! BlockSize too small"); + + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto thread_data_idx_begin = thread_cluster_idx * ThreadSliceLengths{}; + + threadwise_transfer_.SetSrcSliceOrigin(src_desc, + src_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunRead(src_desc, src_buf); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunWrite(dst_desc, dst_buf); + } + } + + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); + } + } + + // SrcMoveSliceWindowStepHack to control index calculation move slice window + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& step, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow( + src_desc, step, src_move_slice_window_step_hack); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(BlockSize == thread_cluster_desc_.GetElementSize() or + get_thread_local_1d_id() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v5r1; + + ThreadwiseTransfer threadwise_transfer_; +}; + 
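+// A typical block-to-LDS copy loop built on this class looks roughly like the
+// sketch below (illustration only: the descriptor, buffer and extent names are
+// placeholders, not part of this header):
+//
+//   auto blockwise_copy = BlockwiseTensorSliceTransfer_v5r1<...>(
+//       src_grid_desc, src_block_origin, dst_block_desc, make_multi_index(0, 0));
+//
+//   for(index_t k = 0; k < KTotal; k += KPerBlock)
+//   {
+//       blockwise_copy.RunRead(src_grid_desc, src_grid_buf);
+//       blockwise_copy.MoveSrcSliceWindow(src_grid_desc, make_multi_index(0, KPerBlock));
+//       blockwise_copy.RunWrite(dst_block_desc, dst_lds_buf);
+//   }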
+} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp new file mode 100644 index 00000000..31650865 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/blockwise_welford.hpp @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/reduction_common.hpp" + +namespace ck { + +// clang-format off +// Assume: +// 1) work_buffer is buffer (typically LDS) allocated outside as workspace +// 2) work_buffer has T elements, and space size is no less than 3*BlockSize +// 3) mean_value, var_value and count is the input data in vgpr from each thread +// 4) mean_value, var_value and count is the over-written reduced output in vgpr for each thread +// 5) Merge mean and M from ThreadwiseWelford +// clang-format on +template +struct BlockwiseWelford +{ + static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1), + "The product of cluster lengths should be same as BlockSize!"); + + static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0); + static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1); + + static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + __device__ static inline void + Merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b) + { + int count = count_a + count_b; + T count_b_over_count = count == 0 ? type_convert(0) : type_convert(count_b) / count; + T delta = mean_b - mean_a; + mean_a += delta * count_b_over_count; + var_a += var_b + delta * delta * count_a * count_b_over_count; + count_a = count; + } + + __device__ static void Run(T& mean_value, T& var_value, int& count) + { + __shared__ T mean_block_buf[BlockSize]; + __shared__ T var_block_buf[BlockSize]; + __shared__ int count_block_buf[BlockSize]; + + constexpr auto cluster_len_shift = get_shift(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id())); + + const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}]; + const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}]; + + index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx); + + mean_block_buf[offset1] = mean_value; + var_block_buf[offset1] = var_value; + count_block_buf[offset1] = count; + + block_sync_lds(); + + static_for<0, cluster_len_shift, 1>{}([&](auto I) { + constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I()); + + if(thread_k_cluster_id < indOffset) + { + index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx + + make_tuple(0, indOffset)); + + T mean1 = mean_block_buf[offset1]; + T var1 = var_block_buf[offset1]; + int count1 = count_block_buf[offset1]; + + T mean2 = mean_block_buf[offset2]; + T var2 = var_block_buf[offset2]; + int count2 = count_block_buf[offset2]; + + Merge(mean1, var1, count1, mean2, var2, count2); + + mean_block_buf[offset1] = mean1; + var_block_buf[offset1] = var1; + count_block_buf[offset1] = count1; + } + + block_sync_lds(); + }); + + index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0)); + + count = count_block_buf[offset]; + mean_value = 
mean_block_buf[offset]; + + if constexpr(GetActualVariance) + var_value = var_block_buf[offset] / count; + else + var_value = var_block_buf[offset]; + }; +}; +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp new file mode 100644 index 00000000..2163ad32 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp @@ -0,0 +1,244 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" + +namespace ck { + +// clang-format off +// Assume: +// 1) work_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data +// 2) work_buffer has AccDataType elements, and space size is no less than BlockSize +// 3) in_out_value is the input data in vgpr from each thread +// 4) in_out_value is the over-written reduced output in vgpr for each thread +// clang-format on +template > +struct PartitionedBlockwiseReduction +{ + static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1), + "The product of cluster lengths should be same as BlockSize!"); + + static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0); + static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1); + + static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements"); + + static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + template + __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value) + { + static_assert(is_same{}, + "Buffer data type should be consistent as AccDataType!"); + + constexpr auto cluster_len_shift = get_shift(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id())); + + const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}]; + const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}]; + + work_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value; + + __syncthreads(); + + static_for<0, cluster_len_shift, 1>{}([&](auto I) { + constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I()); + + if(thread_k_cluster_id < indOffset) + { + index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx); + index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx + + make_tuple(0, indOffset)); + + AccDataType opData1 = work_buffer[offset1]; + AccDataType opData2 = work_buffer[offset2]; + Accumulation::Calculate(opData1, opData2); + work_buffer(offset1) = opData1; + } + + __syncthreads(); + }); + + index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0)); + + in_out_value = work_buffer[offset]; + }; +}; + +// clang-format off +// Assume: +// 1) work_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data +// 2) work_buffer has AccDataType elements, and space size is no less than BlockSize +// 3) in_out_value is the input data in vgpr from each thread +// 4) in_out_value is the over-written reduced 
output in vgpr for each thread +// clang-format on +template > +struct PartitionedBlockwiseReduction_v2 +{ + static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1), + "The product of cluster lengths should be same as BlockSize!"); + + static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0); + static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1); + + static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements"); + + static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + static constexpr auto thread_cluster_desc = ThreadClusterDesc{}; + + template + __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value) + { + static_assert(is_same{}, + "Buffer data type should be consistent as AccDataType!"); + + constexpr auto cluster_len_shift = get_shift(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id())); + + const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}]; + const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}]; + + work_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value; + + __syncthreads(); + + static_for<0, cluster_len_shift, 1>{}([&](auto I) { + constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I()); + + if(thread_k_cluster_id < indOffset) + { + index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx); + index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx + + make_tuple(0, indOffset)); + + AccDataType opData1 = work_buffer[offset1]; + AccDataType opData2 = work_buffer[offset2]; + Accumulation::Calculate(opData1, opData2); + work_buffer(offset1) = opData1; + } + + __syncthreads(); + }); + + index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0)); + + in_out_value = work_buffer[offset]; + }; +}; + +// clang-format off +// Assume: +// 1) work_val_buffer/work_idx_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data +// 2) work_val_buffer/work_idx_buffer has AccDataType/IndexDataType elements, and space size is no less than BlockSize +// 3) in_out_value/in_out_index is the input data in vgpr from each thread +// 4) in_out_value/in_out_index is the over-written reduced output in vgpr for each thread +// clang-format on +template < + typename AccDataType, + typename IndexDataType, + index_t BlockSize, + typename ThreadClusterLengths_M_K, + typename ThreadClusterArrangeOrder, + typename OpReduce, + bool PropagateNan, + typename Accumulation = + detail::AccumulateWithIndexAndNanCheck> +struct PartitionedBlockwiseReductionWithIndex +{ + static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1), + "The product of cluster lengths should be same as BlockSize!"); + + static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0); + static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1); + + static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements"); + + static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + // This interface accumulates on both data values and indices + template + __device__ 
static void Reduce(BufferType& work_val_buffer, + IdxBufferType& work_idx_buffer, + AccDataType& in_out_value, + IndexDataType& in_out_index) + { + static_assert(is_same{}, + "Buffer data type should be consistent as AccDataType!"); + static_assert(is_same{}, + "Buffer data type should be consistent as IndexDataType!"); + + constexpr auto cluster_len_shift = get_shift(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id())); + + const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}]; + const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}]; + + work_val_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value; + work_idx_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_index; + + __syncthreads(); + + static_for<0, cluster_len_shift, 1>{}([&](auto I) { + constexpr index_t indOffset = 1 << I(); + + if(thread_k_cluster_id % (indOffset * 2) == 0) + { + index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx); + index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx + + make_tuple(0, indOffset)); + + AccDataType opData1 = work_val_buffer[offset1]; + AccDataType opData2 = work_val_buffer[offset2]; + IndexDataType currIndex1 = work_idx_buffer[offset1]; + IndexDataType currIndex2 = work_idx_buffer[offset2]; + + Accumulation::Calculate(opData1, opData2, currIndex1, currIndex2); + work_val_buffer(offset1) = opData1; + work_idx_buffer(offset1) = currIndex1; + } + + __syncthreads(); + }); + + index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0)); + + in_out_value = work_val_buffer[offset]; + in_out_index = work_idx_buffer[offset]; + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp new file mode 100644 index 00000000..0e5dfb35 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. 
ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +template +struct ThreadGroupTensorSliceTransfer_v4r1 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + static constexpr auto thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{}; + + using Index = MultiIndex; + + __device__ constexpr ThreadGroupTensorSliceTransfer_v4r1( + const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const SrcElementwiseOperation& src_element_op, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin, + const DstElementwiseOperation& dst_element_op) + : threadwise_transfer_(src_desc, + make_zero_multi_index(), + src_element_op, + dst_desc, + make_zero_multi_index(), + dst_element_op) + + { + static_assert(nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == SrcDimAccessOrder::Size() && nDim == DstDimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), + "wrong! ThreadGroup::GetNumOfThread() too small"); + + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(ThreadGroup::GetThreadId())); + + const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths; + + threadwise_transfer_.SetSrcSliceOrigin(src_desc, + src_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + Number thread_scratch_id = Number{}) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunRead(src_desc, src_buf, thread_scratch_id); + } + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + Number thread_scratch_id = Number{}) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.RunWrite(dst_desc, dst_buf, thread_scratch_id); + } + } + + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf, + Number thread_scratch_id) + { + RunRead(src_desc, src_buf, thread_scratch_id); + RunWrite(dst_desc, dst_buf, thread_scratch_id); + } + + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto 
thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v3r1; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp new file mode 100644 index 00000000..5c47a49b --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +template +struct ThreadGroupTensorSliceTransfer_v6r1 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{}; + + using Index = MultiIndex; + + __device__ constexpr ThreadGroupTensorSliceTransfer_v6r1(const SrcDesc& src_desc, + const Index& src_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin, + const ElementwiseOperation& element_op) + : threadwise_transfer_(src_desc, + make_zero_multi_index(), + dst_desc, + make_zero_multi_index(), + element_op) + + { + static_assert(nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == DimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), + "wrong! 
ThreadGroup::GetNumOfThread() too small"); + + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(ThreadGroup::GetThreadId())); + + const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths; + + threadwise_transfer_.SetSrcSliceOrigin(src_desc, + src_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.Run(src_desc, src_buf, dst_desc, dst_buf); + } + } + + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_desc, step); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v6r1; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp new file mode 100644 index 00000000..aa33fc08 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. It does not keep reference to tensor descriptor +// 3. 
Run() does not construct new tensor coordinate +template +struct ThreadGroupTensorSliceTransfer_v6r2 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{}; + + using Index = MultiIndex; + + __device__ constexpr ThreadGroupTensorSliceTransfer_v6r2(const Src0Desc& src0_desc, + const Index& src0_block_slice_origin, + const Src1Desc& src1_desc, + const Index& src1_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin, + const ElementwiseOperation& element_op) + : threadwise_transfer_(src0_desc, + make_zero_multi_index(), + src1_desc, + make_zero_multi_index(), + dst_desc, + make_zero_multi_index(), + element_op) + + { + static_assert(nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == DimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), + "wrong! ThreadGroup::GetNumOfThread() too small"); + + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(ThreadGroup::GetThreadId())); + + const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths; + + threadwise_transfer_.SetSrc0SliceOrigin( + src0_desc, src0_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetSrc1SliceOrigin( + src1_desc, src1_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void Run(const Src0Desc& src0_desc, + const Src0Buffer& src0_buf, + const Src1Desc& src1_desc, + const Src1Buffer& src1_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.Run(src0_desc, src0_buf, src1_desc, src1_buf, dst_desc, dst_buf); + } + } + + __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrc0SliceWindow(src0_desc, step); + } + } + + __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrc1SliceWindow(src1_desc, step); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using 
ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v6r2; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp new file mode 100644 index 00000000..eb5f589a --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp @@ -0,0 +1,183 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp" + +namespace ck { + +// this version does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray instead of C array for thread buffer +// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor +// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate +template +struct ThreadGroupTensorSliceTransfer_v6r3 +{ + static constexpr index_t nDim = remove_reference_t::GetNumOfDimension(); + + static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{}; + + using Index = MultiIndex; + + __device__ constexpr ThreadGroupTensorSliceTransfer_v6r3(const Src0Desc& src0_desc, + const Index& src0_block_slice_origin, + const Src1Desc& src1_desc, + const Index& src1_block_slice_origin, + const Src2Desc& src2_desc, + const Index& src2_block_slice_origin, + const DstDesc& dst_desc, + const Index& dst_block_slice_origin, + const ElementwiseOperation& element_op) + : threadwise_transfer_(src0_desc, + make_zero_multi_index(), + src1_desc, + make_zero_multi_index(), + src2_desc, + make_zero_multi_index(), + dst_desc, + make_zero_multi_index(), + element_op) + + { + static_assert(nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == remove_cvref_t::GetNumOfDimension() && + nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == DimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), + "wrong! 
ThreadGroup::GetNumOfThread() too small"); + + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths; + + threadwise_transfer_.SetSrc0SliceOrigin( + src0_desc, src0_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetSrc1SliceOrigin( + src1_desc, src1_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetSrc2SliceOrigin( + src2_desc, src2_block_slice_origin + thread_data_idx_begin); + threadwise_transfer_.SetDstSliceOrigin(dst_desc, + dst_block_slice_origin + thread_data_idx_begin); + } + } + + template + __device__ void Run(const Src0Desc& src0_desc, + const Src0Buffer& src0_buf, + const Src1Desc& src1_desc, + const Src1Buffer& src1_buf, + const Src2Desc& src2_desc, + const Src2Buffer& src2_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.Run( + src0_desc, src0_buf, src1_desc, src1_buf, src2_desc, src2_buf, dst_desc, dst_buf); + } + } + + __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrc0SliceWindow(src0_desc, step); + } + } + + __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrc1SliceWindow(src1_desc, step); + } + } + + __device__ void MoveSrc2SliceWindow(const Src2Desc& src2_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrc2SliceWindow(src2_desc, step); + } + } + + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_desc, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v6r3; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp new file mode 100644 index 00000000..3bd78063 --- /dev/null +++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp" + +namespace ck { + +// Thread-group level multi-source, multi-destination tensor slice data movement +// Assume: +// 1. All sources and destinations are DynamicBuffer +// 2. Same VectorDim and ScalerPerVector for all sources and destinations +// 3. DstInMemOps are per destination tensor +// 4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor +// 5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor +// +// Does following things to avoid scratch memory issue +// 1. Pass tensor descritpors by reference (or tuple of references) +// 2. Does not keep reference to tensor descriptor +// 3. Does not construct new tensor coordinate when call Run() +template + typename SliceLengths, + typename ThreadClusterLengths, + typename ThreadClusterArrangeOrder, + typename DimAccessOrder, + index_t VectorDim, + index_t ScalarPerVector, + typename ThreadTransferSrcResetCoordinateAfterRunFlags, + typename ThreadTransferDstResetCoordinateAfterRunFlags> +struct ThreadGroupTensorSliceTransfer_v7 +{ + static constexpr index_t nDim = + remove_cvref_t>::GetNumOfDimension(); + + static constexpr index_t nSrc = remove_cvref_t::Size(); + static constexpr index_t nDst = remove_cvref_t::Size(); + + using Index = MultiIndex; + + static constexpr auto thread_slice_lengths = SliceLengths{} / ThreadClusterLengths{}; + + __device__ constexpr ThreadGroupTensorSliceTransfer_v7( + const SrcDescs& src_descs, + const StaticallyIndexedArray& src_block_slice_origins, + const DstDescs& dst_descs, + const StaticallyIndexedArray& dst_block_slice_origins, + const ElementwiseOperation& element_op) + : threadwise_transfer_(src_descs, + StaticallyIndexedArray{}, + dst_descs, + StaticallyIndexedArray{}, + element_op) + { + static_assert(nSrc == SrcDatas::Size() && nSrc == SrcDescs::Size() && + nSrc == ThreadTransferSrcResetCoordinateAfterRunFlags::Size() && + nDst == DstDatas::Size() && nDst == DstDescs::Size() && + nDst == ThreadTransferDstResetCoordinateAfterRunFlags::Size(), + "wrong!"); + + static_for<0, nSrc, 1>{}([&](auto i) { + static_assert( + nDim == remove_cvref_t>::GetNumOfDimension(), + "wrong!"); + }); + + static_for<0, nDst, 1>{}([&](auto i) { + static_assert( + nDim == remove_cvref_t>::GetNumOfDimension(), + "wrong!"); + }); + + static_assert(nDim == ThreadClusterLengths::Size() && + nDim == ThreadClusterArrangeOrder::Size() && + nDim == DimAccessOrder::Size(), + "wrong! nDim not consistent"); + + static_assert( + is_same{}, + "wrong! threads should be mapped to cover entire slicing window"); + + static_assert(ThreadGroup::GetNumOfThread() >= thread_cluster_desc_.GetElementSize(), + "wrong! 
ThreadGroup::GetNumOfThread() too small"); + + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + const auto thread_cluster_idx = thread_cluster_desc_.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto thread_data_idx_begin = thread_cluster_idx * thread_slice_lengths; + + const auto src_thread_slice_origins = generate_tuple( + [&](auto i) { return src_block_slice_origins[i] + thread_data_idx_begin; }, + Number{}); + + const auto dst_thread_slice_origins = generate_tuple( + [&](auto i) { return dst_block_slice_origins[i] + thread_data_idx_begin; }, + Number{}); + + threadwise_transfer_.SetSrcSliceOrigins(src_descs, src_thread_slice_origins); + threadwise_transfer_.SetDstSliceOrigins(dst_descs, dst_thread_slice_origins); + } + } + + template + __device__ void Run(const SrcDescs& src_descs, + const SrcBuffers& src_bufs, + const DstDescs& dst_descs, + DstBuffers dst_bufs) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.Run(src_descs, src_bufs, dst_descs, dst_bufs); + } + } + + template + __device__ void + MoveSrcSliceWindow(const SrcDescs& src_descs, Number iSrc, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveSrcSliceWindow(src_descs, iSrc, step); + } + } + + template + __device__ void + MoveDstSliceWindow(const DstDescs& dst_descs, Number iDst, const Index& step) + { + if(ThreadGroup::GetNumOfThread() == thread_cluster_desc_.GetElementSize() or + ThreadGroup::GetThreadId() < thread_cluster_desc_.GetElementSize()) + { + threadwise_transfer_.MoveDstSliceWindow(dst_descs, iDst, step); + } + } + + private: + static constexpr auto thread_cluster_desc_ = + make_cluster_descriptor(ThreadClusterLengths{}, ThreadClusterArrangeOrder{}); + + using ThreadwiseTransfer = + ThreadwiseTensorSliceTransfer_v7; + + ThreadwiseTransfer threadwise_transfer_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp new file mode 100644 index 00000000..a4a29f5d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { + +enum struct ConvolutionBackwardDataSpecialization +{ + Default, + Filter1x1Stride1Pad0, +}; + +inline std::string +getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecialization& s) +{ + switch(s) + { + case ConvolutionBackwardDataSpecialization::Default: return "Default"; + case ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0: + return "FFilter1x1Stride1Pad0"; + default: return "Unrecognized specialization!"; + } +} + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp new file mode 100644 index 00000000..20b2a152 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { + +enum struct ConvolutionBackwardWeightSpecialization +{ + Default, + Filter1x1Stride1Pad0, + Filter1x1Pad0, + OddC, +}; + +inline std::string +getConvBackwardWeightSpecializationString(const ConvolutionBackwardWeightSpecialization& s) +{ + switch(s) + { + case ConvolutionBackwardWeightSpecialization::Default: return "Default"; + case ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0: + return "Filter1x1Stride1Pad0"; + case ConvolutionBackwardWeightSpecialization::Filter1x1Pad0: return "Filter1x1Pad0"; + case ConvolutionBackwardWeightSpecialization::OddC: return "OddC"; + default: return "Unrecognized specialization!"; + } +} +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp new file mode 100644 index 00000000..953ff1e0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +namespace ck { +namespace tensor_operation { +namespace device { + +enum struct ConvolutionForwardSpecialization +{ + Default, + Filter1x1Pad0, + Filter1x1Stride1Pad0, + OddC, +}; + +inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s) +{ + switch(s) + { + case ConvolutionForwardSpecialization::Default: return "Default"; + case ConvolutionForwardSpecialization::Filter1x1Pad0: return "Filter1x1Pad0"; + case ConvolutionForwardSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0"; + case ConvolutionForwardSpecialization::OddC: return "OddC"; + default: return "Unrecognized specialization!"; + } +} + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp new file mode 100644 index 00000000..8137098e --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_base.hpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
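+//
+// BaseArgument / BaseInvoker / BaseOperator below define the calling
+// convention shared by every device operator in this patch. A hedged usage
+// sketch (the operator instance and argument values are placeholders):
+//
+//   auto op       = DeviceOpInstance{};
+//   auto argument = op.MakeArgumentPointer(/* pointers, lengths, strides, element ops */);
+//   auto invoker  = op.MakeInvokerPointer();
+//
+//   if(!op.IsSupportedArgument(argument.get()))
+//       return; // wrong! this instance does not support the problem
+//
+//   float ave_time = invoker->Run(argument.get(), StreamConfig{});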
+ +#pragma once + +#include +#include +#include + +#include "ck/stream_config.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +struct BaseArgument +{ + BaseArgument() = default; + BaseArgument(const BaseArgument&) = default; + BaseArgument& operator=(const BaseArgument&) = default; + + virtual ~BaseArgument() {} + + void* p_workspace_ = nullptr; +}; + +struct BaseInvoker +{ + BaseInvoker() = default; + BaseInvoker(const BaseInvoker&) = default; + BaseInvoker& operator=(const BaseInvoker&) = default; + + virtual float Run(const BaseArgument*, const StreamConfig& = StreamConfig{}) + { + return float{0}; + } + + virtual ~BaseInvoker() {} +}; + +struct BaseOperator +{ + BaseOperator() = default; + BaseOperator(const BaseOperator&) = default; + BaseOperator& operator=(const BaseOperator&) = default; + + virtual bool IsSupportedArgument(const BaseArgument*) { return false; } + virtual std::string GetTypeString() const { return ""; } + + virtual std::string GetTypeIdName() const { return typeid(*this).name(); } + + virtual std::string GetTypeIdHashCode() const + { + std::ostringstream oss; + + oss << std::hex << typeid(*this).hash_code(); + + return oss.str(); + }; + + virtual size_t GetWorkSpaceSize(const BaseArgument*) const { return 0; } + + virtual void SetWorkSpacePointer(BaseArgument* p_arg, void* p_workspace) const + { + assert(p_arg); + p_arg->p_workspace_ = p_workspace; + } + + virtual ~BaseOperator() {} +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp new file mode 100644 index 00000000..9fcd893c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] +// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] +// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] 
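+// For example, with one grouped dimension and two dimensions each for M, N and
+// K, the operands are A[G0, M0, M1, K0, K1], B[G0, N0, N1, K0, K1] and
+// E[G0, M0, M1, N0, N1]; collapsing the grouped dimensions gives a batched GEMM
+// with M = M0 * M1, N = N0 * N1 and K = K0 * K1 (the dimension counts here are
+// illustrative only).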
+template +struct DeviceBatchedContractionMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp new file mode 100644 index 00000000..e7559132 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm.hpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchedGemm : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + ck::index_t BatchStrideA, + ck::index_t BatchStrideB, + ck::index_t BatchStrideC, + ck::index_t Batch, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchedGemmPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp new file mode 100644 index 00000000..acd779b2 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp @@ -0,0 +1,50 @@ +#pragma once +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +struct BatchedGemmEPermuteDesc +{ + ck::index_t G0_, G1_, M_, N_; + ck::index_t stride_G0_, stride_G1_, stride_M_, stride_N_; +}; + +template +struct DeviceBatchedGemmEPermute : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp new file mode 100644 index 00000000..af681127 --- /dev/null +++ 
b/include/ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchedGemmGemm : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b0, + const void* p_b1, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t O, + ck::index_t Batch, + ck::index_t StrideA, + ck::index_t StrideB0, + ck::index_t StrideB1, + ck::index_t StrideC, + ck::index_t BatchStrideA, + ck::index_t BatchStrideB0, + ck::index_t BatchStrideB1, + ck::index_t BatchStrideC, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + Acc0ElementwiseOperation acc0_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp new file mode 100644 index 00000000..116e62c0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchedGemmMultiD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor"); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t Batch, + index_t StrideA, + index_t StrideB, + const std::array& StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp new file mode 100644 index 00000000..eacc5976 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
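In the batched GEMM interfaces above, Stride* is the row (leading-dimension) stride inside a single GEMM while BatchStride* is the element offset between consecutive matrices in the batch. A minimal sketch of that relationship, assuming packed row-major A[Batch, M, K], B[Batch, K, N] and E[Batch, M, N] with made-up sizes:

#include <iostream>

int main()
{
    // Hypothetical problem sizes
    const long long Batch = 16, M = 256, N = 128, K = 64;

    // Packed row-major layouts: the row stride equals the number of columns
    const long long StrideA = K, StrideB = N, StrideE = N;

    // The offset between batch b and b + 1 is one whole matrix
    const long long BatchStrideA = M * K;
    const long long BatchStrideB = K * N;
    const long long BatchStrideE = M * N;

    // Element (b, m, k) of A then lives at b * BatchStrideA + m * StrideA + k
    const long long b = 3, m = 10, k = 5;
    std::cout << "A(3, 10, 5) offset = " << b * BatchStrideA + m * StrideA + k << '\n';
    std::cout << "Batch=" << Batch << " StrideB=" << StrideB << " BatchStrideB=" << BatchStrideB
              << " StrideE=" << StrideE << " BatchStrideE=" << BatchStrideE << '\n';
}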
+ +#pragma once + +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchedGemmMultipleDGemmMultipleD : public BaseOperator +{ + static constexpr index_t NumD0Tensor = D0sDataType::Size(); + static constexpr index_t NumD1Tensor = D1sDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a0, + const void* p_b0, + std::array p_d0s, + const void* p_b1, + std::array p_d1s, + void* p_e1, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t O, + ck::index_t Batch, + ck::index_t StrideA0, + ck::index_t StrideB0, + std::array StrideD0s, + ck::index_t StrideB1, + std::array StrideD1s, + ck::index_t StrideE1, + ck::index_t BatchStrideA0, + ck::index_t BatchStrideB0, + std::array BatchStrideD0s, + ck::index_t BatchStrideB1, + std::array BatchStrideD1s, + ck::index_t BatchStrideE1, + A0ElementwiseOperation a0_element_op, + B0ElementwiseOperation b0_element_op, + CDE0ElementwiseOperation cde0_element_op, + B1ElementwiseOperation b1_element_op, + CDE1ElementwiseOperation cde1_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp new file mode 100644 index 00000000..c1f85e57 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template // TODO: enum for mask type +struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b0, + const void* p_b1, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t O, + ck::index_t Batch, + ck::index_t StrideA, + ck::index_t StrideB0, + ck::index_t StrideB1, + ck::index_t StrideC, + ck::index_t BatchStrideA, + ck::index_t BatchStrideB0, + ck::index_t BatchStrideB1, + ck::index_t BatchStrideC, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + Acc0ElementwiseOperation acc0_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp new file mode 100644 index 00000000..ff555199 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "device_base.hpp" +#include "ck/tensor_operation/gpu/device/masking_specialization.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator +{ + static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); + static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size(); + + virtual std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b0, + const void* p_b1, + void* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + Acc0ElementwiseOperation acc0_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp new file mode 100644 index 00000000..d39f3b7c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
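DeviceBatchedGemmSoftmaxGemm and its permute variant above fuse two GEMMs with a softmax in between: acc0[m, n] = sum_k a_op(A[m, k]) * b0_op(B0[n, k]), then a row-wise softmax over N, then C[m, o] = sum_n softmax(acc0)[m, n] * B1[o, n]. A minimal scalar reference of that composition, assuming row-major storage and omitting masking, the optional bias tensors, and the elementwise operators:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    // Tiny made-up sizes: A[M, K], B0[N, K], B1[O, N], C[M, O]
    const int M = 2, N = 3, K = 4, O = 2;
    std::vector<float> A(M * K, 1.0f), B0(N * K, 0.5f), B1(O * N, 2.0f), C(M * O, 0.0f);

    for(int m = 0; m < M; ++m)
    {
        // first GEMM: acc0[n] = sum_k A[m, k] * B0[n, k]
        std::vector<float> acc0(N, 0.0f);
        for(int n = 0; n < N; ++n)
            for(int k = 0; k < K; ++k)
                acc0[n] += A[m * K + k] * B0[n * K + k];

        // numerically stabilized softmax over the N dimension
        const float mx = *std::max_element(acc0.begin(), acc0.end());
        float sum = 0.0f;
        for(float& v : acc0) { v = std::exp(v - mx); sum += v; }
        for(float& v : acc0) v /= sum;

        // second GEMM: C[m, o] = sum_n softmax(acc0)[n] * B1[o, n]
        for(int o = 0; o < O; ++o)
            for(int n = 0; n < N; ++n)
                C[m * O + o] += acc0[n] * B1[o * N + n];
    }

    for(float v : C) std::cout << v << ' '; // each entry is 2.0 here
    std::cout << '\n';
}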
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormBwd : public BaseOperator +{ + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + virtual std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnDscaleDbiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_dy, + const void* p_scale, + const void* p_savedMean, + const void* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + void* p_dx, + void* p_dscale, + void* p_dbias) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchNormBwdPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp new file mode 100644 index 00000000..aa93dd9c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormFwd : public BaseOperator +{ + virtual std::unique_ptr MakeArgumentPointer( + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + void* p_y, + void* resultSaveMean, + void* resultSaveInvVariance, + double exponentialAverageFactor, + void* resultRunningMean, + void* resultRunningVariance) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchNormFwdPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp new file mode 100644 index 00000000..8a00fd9d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormInfer : public BaseOperator +{ + virtual std::unique_ptr MakeArgumentPointer( + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + const void* estimatedMean, + const void* estimatedInvVariance, + void* p_y) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceBatchNormInferPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_cgemm.hpp b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp new file mode 100644 index 00000000..aedae538 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_cgemm.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceCGemm : public BaseOperator +{ + virtual std::unique_ptr MakeArgumentPointer(const void* p_a_real, + const void* p_a_imag, + const void* p_b_real, + const void* p_b_imag, + void* p_c_real, + void* p_c_imag, + void* p_workspace, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + ck::index_t KBatch = 1) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; + virtual std::size_t GetWorkspaceSize(index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC) = 0; +}; + +template +using DeviceCGemmPtr = std::unique_ptr< + DeviceCGemm>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp new file mode 100644 index 00000000..dbc525c0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[M0, M1, M2, ..., K0, K1, K2, ...] +// B[N0, N1, N2, ..., K0, K1, K2, ...] +// D[M0, M1, M2, ..., N0, N1, N2, ...] +// E[M0, M1, M2, ..., N0, N1, N2, ...] 
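A small standalone reference of this contraction, assuming two M dimensions, one N dimension, two K dimensions, a single D tensor, and a hypothetical bilinear cde_op (E = alpha * C + beta * D); all tensors are packed row-major in the dimension order given above:

#include <iostream>
#include <vector>

int main()
{
    // Tiny made-up extents: A[M0, M1, K0, K1], B[N0, K0, K1], D/E[M0, M1, N0]
    const int M0 = 2, M1 = 3, N0 = 2, K0 = 2, K1 = 4;
    const float alpha = 1.0f, beta = 0.5f;

    std::vector<float> A(M0 * M1 * K0 * K1, 1.0f);
    std::vector<float> B(N0 * K0 * K1, 0.25f);
    std::vector<float> D(M0 * M1 * N0, 2.0f);
    std::vector<float> E(M0 * M1 * N0, 0.0f);

    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
            {
                float c = 0.0f; // C = A * B, contracted over all K dimensions
                for(int k0 = 0; k0 < K0; ++k0)
                    for(int k1 = 0; k1 < K1; ++k1)
                        c += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1] *
                             B[(n0 * K0 + k0) * K1 + k1];

                const int e_idx = (m0 * M1 + m1) * N0 + n0;
                // hypothetical bilinear cde_op: E = alpha * C + beta * D
                E[e_idx] = alpha * c + beta * D[e_idx];
            }

    std::cout << "E[0] = " << E[0] << '\n'; // 1 * (K0 * K1 * 0.25) + 0.5 * 2 = 3
}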
+template +struct DeviceContractionMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp new file mode 100644 index 00000000..82054a3c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceConvBwdData : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(void* p_in, + const void* p_wei, + const void* p_out, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp new file mode 100644 index 00000000..4b988108 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd.hpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceConvFwd : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_in, + const void* p_wei, + void* p_out, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp new file mode 100644 index 00000000..5a627dee --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceConvFwdBiasActivation : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_in, + const void* p_wei, + void* p_out, + const void* p_bias, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceConvFwdBiasActivationPtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp new file mode 100644 index 00000000..cc139303 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceConvFwdBiasActivationAdd : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_in, + const void* p_wei, + void* p_out, + const void* p_bias, + const void* p_resi, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceConvFwdBiasActivationAddPtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp new file mode 100644 index 00000000..23aada0f --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_2d.hpp @@ -0,0 +1,341 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +template +struct DeviceElementwise : public DeviceElementwiseBase +{ + static constexpr index_t NumDim = NumDim_m + NumDim_n; + + static constexpr int NumInput = InDataTypeTuple::Size(); + static constexpr int NumOutput = OutDataTypeTuple::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static_assert(NumInput == InScalarPerVectorSeq::Size() && + NumOutput == OutScalarPerVectorSeq::Size(), + "Tuple size is inconsistent with the number of in/out!"); + + static auto GenerateInDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + template + static auto PadDescriptor_MN_2d(Desc_MN desc_mn, + index_t gridSize, + index_t blockSize, + index_t num_threads_m, + index_t num_threads_n) + { + std::ignore = blockSize; + std::ignore = gridSize; + const auto m = desc_mn.GetLength(I0); + const auto n = desc_mn.GetLength(I1); + const index_t loop_step_m = num_threads_m * MPerThread; + const index_t loop_step_n = num_threads_n * NPerThread; + const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m; + const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n; + + const auto desc_mn_pad = transform_tensor_descriptor( + desc_mn, + make_tuple(make_right_pad_transform(m, 
pad_m), make_right_pad_transform(n, pad_n)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return desc_mn_pad; + } + + static auto MakeDescriptor_MN(const std::array& lengths, + const std::array& stride, + index_t gridSize, + index_t blockSize, + index_t num_threads_m, + index_t num_threads_n) + { + auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); + auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); + + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDim_m, 1>::type(); + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type(); + + const auto mLengths = get_container_subset(tupleOfShape, mDimIds); + const auto nLengths = get_container_subset(tupleOfShape, nDimIds); + + // merge nd to 2d desc - [s0 * s1 * ...] + + if constexpr(NumDim > 2) + { + const auto desc_mn = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return PadDescriptor_MN_2d(desc_mn, gridSize, blockSize, num_threads_m, num_threads_n); + } + else + return PadDescriptor_MN_2d(desc, gridSize, blockSize, num_threads_m, num_threads_n); + } + + template + static auto GenerateInOutGrid2dDescTuple(Number) + { + return generate_tuple( + [&](auto) { + if constexpr(NumDim > 2) + { + return MakeDescriptor_MN({1, 1}, {1, 1}, 1, 1, 1, 1); + } + else + { + return MakeDescriptor_MN({1}, {1}, 1, 1, 1, 1); + }; + }, + Number{}); + }; + + using OutGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number{})); + using InGrid2dDescTuple = decltype(GenerateInOutGrid2dDescTuple(Number{})); + + using GridwiseElementwise = GridwiseElementwise_2D; + + struct Argument : public BaseArgument + { + Argument(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) + + : lengths_(lengths), + inStridesArray_(inStridesArray), + outStridesArray_(outStridesArray), + elementwise_op_(elementwise_op), + blockSize_(256), + gridSize_(120), // FIXME - Calculate the grid size by number of CU in the future + num_threads_m_((gridSize_ * blockSize_) / 16), + num_threads_n_(16) + { + static_assert(NumDim_m > 0, ""); + static_assert(NumDim_n > 0, ""); + + in_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(in_dev_buffers[I.value]); + }, + Number{}); + + out_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(out_dev_buffers[I.value]); + }, + Number{}); + + in_grid_2d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeDescriptor_MN(lengths, + inStridesArray[I.value], + gridSize_, + blockSize_, + num_threads_m_, + num_threads_n_); + }, + Number{}); + + out_grid_2d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeDescriptor_MN(lengths, + outStridesArray[I.value], + gridSize_, + blockSize_, + num_threads_m_, + num_threads_n_); + }, + Number{}); + } + + InDataTypePointerTuple in_dev_buffers_; + OutDataTypePointerTuple out_dev_buffers_; + InGrid2dDescTuple in_grid_2d_desc_tuple_; + OutGrid2dDescTuple out_grid_2d_desc_tuple_; + + std::array lengths_; + std::array, NumInput> inStridesArray_; + 
std::array, NumOutput> outStridesArray_; + + ElementwiseOperation elementwise_op_; + index_t blockSize_; + index_t gridSize_; + index_t num_threads_m_; + index_t num_threads_n_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel = kernel_elementwise_2d; + + float elapsed_time = launch_and_time_kernel(stream_config, + kernel, + dim3(arg.gridSize_), + dim3(arg.blockSize_), + 0, + arg.in_grid_2d_desc_tuple_, + arg.out_grid_2d_desc_tuple_, + arg.in_dev_buffers_, + arg.out_dev_buffers_, + arg.elementwise_op_, + arg.num_threads_m_, + arg.num_threads_n_); + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if(pArg == nullptr) + return false; + + if(pArg->lengths_.back() % MPerThread != 0) + return false; + + auto IsScalarPerVectorValid = [&](const std::array& lengths, + const std::array& strides, + index_t scalarPerVector, + index_t vectorDim) { + if(strides[vectorDim] == 1 && + (lengths[vectorDim] % scalarPerVector == 0 || + lengths[vectorDim] % scalarPerVector == lengths[vectorDim])) + { + return true; + } + if(strides[vectorDim] != 1 && scalarPerVector == strides[vectorDim]) + { + return true; + } + return false; + }; + + bool valid = true; + static_for<0, NumInput, 1>{}([&](auto I) { + if(!IsScalarPerVectorValid(pArg->lengths_, + pArg->inStridesArray_[I.value], + InScalarPerVectorSeq::At(I), + NumDim_m - 1)) + valid = false; + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + if(!IsScalarPerVectorValid(pArg->lengths_, + pArg->outStridesArray_[I.value], + OutScalarPerVectorSeq::At(I), + NumDim - 1)) + valid = false; + }); + + return valid; + }; + + std::unique_ptr + MakeArgumentPointer(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) override + { + return std::make_unique(lengths, + inStridesArray, + outStridesArray, + in_dev_buffers, + out_dev_buffers, + elementwise_op); + } + + static auto MakeInvoker() { return Invoker{}; } + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; +}; // namespace device + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp new file mode 100644 index 00000000..728faf54 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_base.hpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwiseBase : public BaseOperator +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + static constexpr int NumOutput = OutDataTypeTuple::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; // namespace device + +template +using DeviceElementwiseBasePtr = std::unique_ptr< + DeviceElementwiseBase>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp new file mode 100644 index 00000000..42f821ff --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwiseNormalization : public BaseOperator +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::array, NumInput> inStridesArray, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + AccDataType epsilon, + const std::array in_dev_buffers, + const void* p_gamma, + const void* p_beta, + void* p_y, + XElementwiseOperation x_elementwise_op, + YElementwiseOperation y_elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceElementwiseNormalizationPtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_gemm.hpp new file mode 100644 index 00000000..c0af6f80 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm.hpp @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
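DeviceElementwiseNormalization above first applies x_elementwise_op to the NumInput input tensors and then normalizes the result with gamma, beta and epsilon over the reduce dimensions. A minimal scalar reference, assuming two inputs combined by addition, normalization over the last dimension, and an identity y_elementwise_op:

#include <cmath>
#include <iostream>
#include <vector>

int main()
{
    // Tiny made-up problem: two inputs of shape [M, N], normalized over N
    const int M = 2, N = 4;
    const float epsilon = 1e-4f;
    std::vector<float> a{1, 2, 3, 4, 5, 6, 7, 8}, b(M * N, 1.0f);
    std::vector<float> gamma(N, 1.0f), beta(N, 0.0f), y(M * N, 0.0f);

    for(int m = 0; m < M; ++m)
    {
        // hypothetical x_elementwise_op: x = a + b
        std::vector<float> x(N);
        for(int n = 0; n < N; ++n)
            x[n] = a[m * N + n] + b[m * N + n];

        // mean and variance over the reduced dimension
        float mean = 0.0f, meansq = 0.0f;
        for(float v : x) { mean += v; meansq += v * v; }
        mean /= N;
        meansq /= N;
        const float var = meansq - mean * mean;

        // normalization with per-column gamma/beta (y_elementwise_op taken as identity)
        for(int n = 0; n < N; ++n)
            y[m * N + n] = (x[n] - mean) / std::sqrt(var + epsilon) * gamma[n] + beta[n];
    }

    for(float v : y) std::cout << v << ' ';
    std::cout << '\n';
}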
+ +#pragma once + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemm : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp new file mode 100644 index 00000000..4c2161ea --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +struct DEGridDesc_M0_M1_M2_N0_N1 +{ + ck::index_t M0_, M1_, M2_, N0_, N1_; + ck::index_t stride_M0_, stride_M1_, stride_M2_, stride_N0_, stride_N1_; +}; + +// input : A[M, K], B[K, N], +// input : D[M, N], ... +// output : E[M, N] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D) +template +struct DeviceGemmBiasCPermute : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_d, + void* p_e, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + DEGridDesc_M0_M1_M2_N0_N1 d_gride_desc, + DEGridDesc_M0_M1_M2_N0_N1 e_gride_desc, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp new file mode 100644 index 00000000..9113bb7b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// GEMM: +// input : A[M, K], B[K, N], +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// D0, D1, ... 
and E have the same layout +template +struct DeviceGemmMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + std::array StrideDs, + ck::index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp new file mode 100644 index 00000000..f4881e32 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// FIXME: DeviceGemmReduce type need to well define the problem +// GEMM: +// input : A[AK0, M, AK1] +// input : B[AK0, N, AK1] +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// output : R0[M], R1[M], ... +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Q0 = reduce0(q_op0(E)), Q1 = reduce1(q_op0(E)), ... +// R0 = r_op0(Q0), R1 = r_op1(Q1), ... +// Assume: +// D0, D1, ... and E have the same layout +template +struct DeviceGemmMultipleDMultipleR : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + static constexpr index_t NumRTensor = RsDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + std::array p_rs, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + std::array StrideDs, + ck::index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + QsElementwiseOperation qs_element_op, + RsElementwiseOperation rs_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceGemmMultipleDMultipleRPtr = + std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp new file mode 100644 index 00000000..fcc088ca --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm_reduce.hpp @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
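DeviceGemmMultipleDMultipleR above chains a GEMM, the D-tensor combination, per-row reductions Q, and the final r_ops. A minimal scalar reference with made-up sizes, assuming one D tensor combined by addition and two reductions that produce the per-row mean and mean-square (q_op0 = identity, q_op1 = square, r_ops = divide by N):

#include <iostream>
#include <vector>

int main()
{
    // Tiny made-up GEMM: A[M, K] * B[K, N], row-major, one D tensor, two reductions
    const int M = 2, N = 3, K = 4;
    std::vector<float> A(M * K, 1.0f), B(K * N, 0.5f), D0(M * N, 1.0f);
    std::vector<float> E(M * N, 0.0f), R0(M, 0.0f), R1(M, 0.0f);

    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            float c = 0.0f;
            for(int k = 0; k < K; ++k)
                c += A[m * K + k] * B[k * N + n];

            // hypothetical cde_op: E = C + D0
            E[m * N + n] = c + D0[m * N + n];

            // q_op0 = identity, q_op1 = square; both reduced by summation over N
            R0[m] += E[m * N + n];
            R1[m] += E[m * N + n] * E[m * N + n];
        }
        // hypothetical r_ops: divide by N to get per-row mean and mean-square
        R0[m] /= N;
        R1[m] /= N;
    }

    std::cout << "E[0]=" << E[0] << " R0[0]=" << R0[0] << " R1[0]=" << R1[0] << '\n';
}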
+ +#pragma once + +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// FIXME: DeviceGemmReduce type need to well define the problem +template +struct DeviceGemmReduce : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_ops, + std::array reduce_out_element_ops, + ck::index_t BatchCount = 1) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceGemmReducePtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp new file mode 100644 index 00000000..c701bff5 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_gemm_splitk.hpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemmSplitK : public BaseOperator +{ + virtual std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + ck::index_t KBatch) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceGemmSplitKPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp new file mode 100644 index 00000000..173c613a --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct ContractionDesc +{ + std::vector a_ms_ks_lengths; + std::vector a_ms_ks_strides; + + std::vector b_ns_ks_lengths; + std::vector b_ns_ks_strides; + + std::array, NumDTensor> ds_ms_ns_lengths; + std::array, NumDTensor> ds_ms_ns_strides; + + std::vector e_ms_ns_lengths; + std::vector e_ms_ns_strides; +}; + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[M0, M1, M2, ..., K0, K1, K2, ...] +// B[N0, N1, N2, ..., K0, K1, K2, ...] +// D[M0, M1, M2, ..., N0, N1, N2, ...] +// E[M0, M1, M2, ..., N0, N1, N2, ...] 
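Each group in the grouped contraction is described by one ContractionDesc holding its own lengths and strides. A minimal sketch of what two such per-group descriptors could look like, using a simplified local mirror of the struct without D tensors and packed row-major strides with made-up extents:

#include <cstddef>
#include <iostream>
#include <vector>

// Simplified local mirror of ContractionDesc (no D tensors), used only for illustration.
struct SimpleContractionDesc
{
    std::vector<long> a_ms_ks_lengths, a_ms_ks_strides;
    std::vector<long> b_ns_ks_lengths, b_ns_ks_strides;
    std::vector<long> e_ms_ns_lengths, e_ms_ns_strides;
};

int main()
{
    // Two hypothetical groups, each an [M0, M1] x [N0] contraction over [K0, K1],
    // all tensors packed row-major in the dimension order given by the comment above.
    std::vector<SimpleContractionDesc> descs = {
        {{8, 4, 2, 16}, {128, 32, 16, 1}, {6, 2, 16}, {32, 16, 1}, {8, 4, 6}, {24, 6, 1}},
        {{2, 2, 4, 8}, {64, 32, 8, 1}, {3, 4, 8}, {32, 8, 1}, {2, 2, 3}, {6, 3, 1}},
    };

    for(std::size_t g = 0; g < descs.size(); ++g)
        std::cout << "group " << g << ": M = "
                  << descs[g].e_ms_ns_lengths[0] * descs[g].e_ms_ns_lengths[1]
                  << ", N = " << descs[g].e_ms_ns_lengths[2] << '\n';
}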
+template +struct DeviceGroupedContractionMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(std::vector p_a_vec, + std::vector p_b_vec, + std::vector> p_ds_vec, + std::vector p_e_vec, + std::vector> contraction_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp new file mode 100644 index 00000000..3350aec8 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Conv backward data multiple D: +// input : output image A[G, N, K, Ho, Wo] +// input : weight B[G, K, C, Y, X], +// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ... +// output : input image E[G, N, C, Hi, Wi], +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +template +struct DeviceGroupedConvBwdDataMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static_assert(NumDTensor == DsLayout::Size(), "wrong! Inconsistent NumDTensor"); + + virtual std::unique_ptr MakeArgumentPointer( + const void* p_a, // output image + const void* p_b, // weight + const std::array& p_ds, // bias + void* p_e, // input image + const std::array& a_g_n_k_wos_lengths, // output image + const std::array& a_g_n_k_wos_strides, // output image + const std::array& b_g_k_c_xs_lengths, // weight + const std::array& b_g_k_c_xs_strides, // weight + const std::array, NumDTensor>& + ds_g_n_k_wos_lengths, // bias + const std::array, NumDTensor>& + ds_g_n_k_wos_strides, // bias + const std::array& e_g_n_c_wis_lengths, // input image + const std::array& e_g_n_c_wis_strides, // input image + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp new file mode 100644 index 00000000..1258aed7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGroupedConvBwdWeight : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_in, + void* p_wei, + const void* p_out, + ck::index_t G, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp new file mode 100644 index 00000000..644c7ee9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Convolution Forward: +// input : input image A[G, N, C, Hi, Wi], +// input : weight B[G, K, C, Y, X], +// output : output image E[G, N, K, Ho, Wo] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +template +struct DeviceGroupedConvFwd : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* p_in, // input image + const void* p_wei, // weight + void* p_out, // output image + const std::array& in_g_n_c_wis_lengths, + const std::array& in_g_n_c_wis_strides, + const std::array& wei_g_k_c_xs_lengths, + const std::array& wei_g_k_c_xs_strides, + const std::array& out_g_n_k_wos_lengths, + const std::array& out_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const InElementwiseOperation& in_element_op, + const WeiElementwiseOperation& wei_element_op, + const OutElementwiseOperation& out_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..079135e5 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,959 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
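For DeviceGroupedConvFwd above, the spatial extents in out_g_n_k_wos_lengths relate to the input lengths, filter lengths, strides, dilations and pads through the usual convolution output-size formula, out = (in + left_pad + right_pad - dilation * (filter - 1) - 1) / stride + 1. A minimal sketch with made-up 2D sizes:

#include <array>
#include <iostream>

int main()
{
    // Hypothetical 2D problem: spatial dims ordered [H, W]
    const std::array<int, 2> in_spatial{28, 28};   // Hi, Wi
    const std::array<int, 2> filter_spatial{3, 3}; // Y, X
    const std::array<int, 2> conv_strides{2, 2};
    const std::array<int, 2> conv_dilations{1, 1};
    const std::array<int, 2> left_pads{1, 1};
    const std::array<int, 2> right_pads{1, 1};

    for(int d = 0; d < 2; ++d)
    {
        // standard convolution output-length formula
        const int eff_filter = conv_dilations[d] * (filter_spatial[d] - 1) + 1;
        const int out =
            (in_spatial[d] + left_pads[d] + right_pads[d] - eff_filter) / conv_strides[d] + 1;
        std::cout << (d == 0 ? "Ho = " : "Wo = ") << out << '\n'; // 14 for these sizes
    }
}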
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +template +struct ComputePtrOffsetOfStridedBatch +{ + ComputePtrOffsetOfStridedBatch() = default; + + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + Array BatchStrideDs, + index_t BatchStrideE) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideE_(BatchStrideE) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + Array ds_offset; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { ds_offset(i) = g_idx * static_cast(BatchStrideDs_[i]); }); + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE_); + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + Array BatchStrideDs_; + index_t BatchStrideE_; +}; + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link + * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of + * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. 
+ * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). + * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_conv_fwd_dl_multiple_d( + const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const index_t batch_count, + const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, + const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, + const DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11, + const CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11, + const Block2CTileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__)) + // offset base pointer for each work-group + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType); + + __shared__ ABDataType p_shared[shared_block_size]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = DsGridDesc_M0_M10_M11_N0_N10_N11::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseGemm::Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + c_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_k0_m0_m1_k1, + b_grid_desc_k0_n0_n1_k1, + ds_grid_desc_m0_m10_m11_n0_n10_n11, + e_grid_desc_m0_m10_m11_n0_n10_n11, + block_2_ctile_map, + integral_constant{}, + integral_constant{}); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = batch_count; + ignore = a_grid_desc_k0_m0_m1_k1; + ignore = b_grid_desc_k0_n0_n1_k1; + ignore = ds_grid_desc_m0_m10_m11_n0_n10_n11; + ignore = e_grid_desc_m0_m10_m11_n0_n10_n11; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; + + compute_ptr_offset_of_batch.GetAPtrOffset(0); + compute_ptr_offset_of_batch.GetBPtrOffset(0); + compute_ptr_offset_of_batch.GetEPtrOffset(0); +#endif +} +} // namespace + +// +// @brief Device Convolution operation. 
+// +// Supports: +// @li Forward convolution with up to 3 spatial dimentions +// @li Input tensor in GNWC data format +// @li Weight tensor in GKXC data format +// @li Output tensor in GNWK data format +// +// 1D: +// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] +// 2D: +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +// 3D: +// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] +// +template +struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK + : public DeviceGroupedConvFwdMultipleD +{ + using DeviceOp = DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, K0PerBlock}; + + template + static auto + MakeAGridDescriptor_AK0_M_AK1(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + const auto M = in_gemmm_gemmk_desc.GetLength(I0); + const auto K = in_gemmm_gemmk_desc.GetLength(I1); + + const auto AK0 = K / K1; + + return transform_tensor_descriptor( + in_gemmm_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + static auto + MakeBGridDescriptor_BK0_N_BK1(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) + { + const auto wei_gemmnraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); + + const auto wei_gemmn_gemmk_desc = + matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + const auto N = wei_gemmn_gemmk_desc.GetLength(I0); + const auto K = wei_gemmn_gemmk_desc.GetLength(I1); + + const auto BK0 = K / K1; + + return transform_tensor_descriptor( + wei_gemmn_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + static auto + MakeEGridDescriptor_M_N(const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides) + { + const auto out_gemmmraw_gemmnraw_desc = + conv_to_gemm_transformer.template MakeCDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides); + + const auto out_gemmm_gemmn_desc = + matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc); + + return out_gemmm_gemmn_desc; + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + 
const std::array, NumDTensor>& ds_g_n_k_wos_strides) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(ds_g_n_k_wos_lengths[i], + ds_g_n_k_wos_strides[i]); + }, + Number{}); + } + + // desc for problem definition + using AGridDesc_AK0_M_AK1 = remove_cvref_t({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; + using BGridDesc_BK0_N_BK1 = + remove_cvref_t({}, {}))>; + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = remove_cvref_t({}, {}))>; + + // GridwiseGemm + using GridwiseGemm = + GridwiseGemmDlMultipleD_km_kn_mn; + + using AGridDesc_K0_M0_M1_K1 = + decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_AK0_M_AK1{})); + using BGridDesc_K0_N0_N1_K1 = + decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_BK0_N_BK1{})); + using DsGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(GridwiseGemm::MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(DsGridDesc_M_N{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(EGridDesc_M_N{})); + using DefaultBlock2CTileMap = + decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(EGridDesc_M_N{})); + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& + ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& + ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + : p_a_grid_{static_cast(p_a)}, + p_b_grid_{static_cast(p_b)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e)}, + num_group_{a_g_n_c_wis_lengths[0]}, + a_grid_desc_ak0_m_ak1_{ + DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1( + b_g_k_c_xs_lengths, b_g_k_c_xs_strides)}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides)}, + a_grid_desc_k0_m0_m1_k1_{}, + b_grid_desc_k0_n0_n1_k1_{}, + ds_grid_desc_m0_m10_m11_n0_n10_n11_{}, + e_grid_desc_m0_m10_m11_n0_n10_n11_{}, + block_2_ctile_map_{}, + compute_ptr_offset_of_batch_{}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths}, + a_g_n_c_wis_strides_{a_g_n_c_wis_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths}, + e_g_n_k_wos_strides_{e_g_n_k_wos_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // A/B/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + 
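+            // (index 0 of each *_strides array is the G-dimension stride; it is used as
+            //  the per-group pointer offset by the batch-offset helper above)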
compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds[i]); + + // D batch stride + compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0]; + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N( + ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity( + a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, e_grid_desc_m_n_)) + { + + a_grid_desc_k0_m0_m1_k1_ = + GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_ak0_m_ak1_); + b_grid_desc_k0_n0_n1_k1_ = + GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_bk0_n_bk1_); + e_grid_desc_m0_m10_m11_n0_n10_n11_ = + GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(e_grid_desc_m_n_); + + ds_grid_desc_m0_m10_m11_n0_n10_n11_ = + GridwiseGemm::MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(ds_grid_desc_m_n_); + + block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(e_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[K0, M, K1]: " << a_grid_desc_ak0_m_ak1_ << std::endl; + std::cout << "B[K0, N, K1]: " << b_grid_desc_bk0_n_bk1_ << std::endl; + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + std::cout << "num_group: " << num_group_ << std::endl; + + std::cout << "A[k0, m0, m1, k1]: " << a_grid_desc_k0_m0_m1_k1_ << std::endl; + std::cout << "B[k0, n0, n1, k1]: " << b_grid_desc_k0_n0_n1_k1_ << std::endl; + std::cout << "A[m0, m10, m11, n0, n10, n11]: " << e_grid_desc_m0_m10_m11_n0_n10_n11_ + << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + index_t num_group_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_; + BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_; + DsGridDesc_M0_M10_M11_N0_N10_N11 ds_grid_desc_m0_m10_m11_n0_n10_n11_; + CGridDesc_M0_M10_M11_N0_N10_N11 e_grid_desc_m0_m10_m11_n0_n10_n11_; + + // block-to-e-tile map + DefaultBlock2CTileMap block_2_ctile_map_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // for checking IsSupportedArgument() + std::array a_g_n_c_wis_lengths_; + std::array a_g_n_c_wis_strides_; + std::array b_g_k_c_xs_lengths_; + std::array b_g_k_c_xs_strides_; + std::array, NumDTensor> ds_g_n_k_wos_lengths_; + std::array, NumDTensor> ds_g_n_k_wos_strides_; + std::array e_g_n_k_wos_lengths_; + std::array e_g_n_k_wos_strides_; + std::array conv_filter_strides_; + std::array conv_filter_dilations_; + std::array input_left_pads_; + std::array input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + if(!GridwiseGemm::CheckValidity( + arg.a_grid_desc_ak0_m_ak1_, 
arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_)) + { + throw std::runtime_error( + "wrong! DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK has invalid setting"); + } + + const index_t grid_size = + GridwiseGemm::CalculateGridSize(arg.e_grid_desc_m_n_.GetLength(I0), + arg.e_grid_desc_m_n_.GetLength(I1)) * + arg.num_group_; + + auto launch_kernel = [&](auto has_main_k_block_loop, + auto has_double_tail_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + constexpr bool has_double_loop = has_double_tail_k_block_loop; + + const auto kernel = kernel_grouped_conv_fwd_dl_multiple_d< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_K0_M0_M1_K1, + DeviceOp::BGridDesc_K0_N0_N1_K1, + DeviceOp::DsGridDesc_M0_M10_M11_N0_N10_N11, + DeviceOp::CGridDesc_M0_M10_M11_N0_N10_N11, + DefaultBlock2CTileMap, + ComputePtrOffsetOfStridedBatch, + has_main_loop, + has_double_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_k0_m0_m1_k1_, + arg.b_grid_desc_k0_n0_n1_k1_, + arg.ds_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.e_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.block_2_ctile_map_, + arg.compute_ptr_offset_of_batch_); + }; + + const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0); + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0); + const bool has_double_tail_k_block_loop = + GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0); + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + return 0; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + namespace ctc = tensor_layout::convolution; + + // check device + if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")) + { + return false; + } + + // check ConvolutionForwardSpecialization + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 3]; + const index_t ConvStride = arg.conv_filter_strides_[i]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0)) + { + std::cout << "Filter1x1Stride1Pad0 check: XY_index = " << i << " X = " << X + << " ConvStride = " << ConvStride << " LeftPad = " << LeftPad + << " RightPad = " << RightPad << std::endl; + return false; + } + } + } + else if constexpr(ConvForwardSpecialization == + 
ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 3]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && LeftPad == 0 && RightPad == 0)) + { + std::cout << "Filter1x1Stride1Pad0 check: XY_index = " << i << " X = " << X + << " LeftPad = " << LeftPad << " RightPad = " << RightPad + << std::endl; + return false; + } + } + } + + // check vector access of A + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + auto srcVectorLengths = ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1{}; + if(srcVectorLengths[I1] != 1 || srcVectorLengths[I2] != 1) + { + return false; + } + if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0) + { + return false; + } + + const index_t C = arg.a_g_n_c_wis_lengths_[2]; + + if(C % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector access of B + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + + { + auto srcVectorLengths = BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1{}; + if(srcVectorLengths[I1] != 1 || srcVectorLengths[I2] != 1) + { + return false; + } + if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0) + { + return false; + } + + const index_t C = arg.b_g_k_c_xs_lengths_[2]; + + if(C % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector access of E + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.e_g_n_k_wos_lengths_[2]; + + if(!(K % CThreadTransferDstScalarPerVector == 0 && CThreadTransferSrcDstVectorDim == 5)) + { + return false; + } + } + else + { + return false; + } + // check Gridwise GEMM + return GridwiseGemm::CheckValidity( + arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.e_grid_desc_m_n_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + 
cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..47c82117 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,837 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
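+// Grouped forward convolution lowered to an implicit GEMM via TransformConvFwdToGemm and
+// executed with the DL gridwise GEMM (gridwise_gemm_dl_v1r3.hpp); the runtime support
+// check below accepts gfx906 and gfx1030.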
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +struct ComputePtrOffsetOfStridedBatch +{ + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, index_t BatchStrideB, index_t BatchStrideC) + : BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideC_); + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + index_t BatchStrideC_; +}; + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link + * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of + * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). 
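+ *
+ * Example (illustrative): with evenly strided batches, GetAPtrOffset(g) returns
+ * g * BatchStrideA_, so every work-group assigned to batch g reads A starting at
+ * p_a_grid + g * BatchStrideA_ (and likewise B and C via GetBPtrOffset / GetCPtrOffset).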
+ * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_conv_fwd_dl( + const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + CDataType* __restrict__ p_c_grid, + const index_t batch_count, + const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, + const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, + const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, + const Block2CTileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx1030__)) + // offset base pointer for each work-group + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx))); + + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(ABDataType); + + __shared__ ABDataType p_shared[shared_block_size]; + + GridwiseGemm::Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc_k0_m0_m1_k1, + b_grid_desc_k0_n0_n1_k1, + c_grid_desc_m0_m10_m11_n0_n10_n11, + block_2_ctile_map, + integral_constant{}, + integral_constant{}); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m0_m1_k1; + ignore = b_grid_desc_k0_n0_n1_k1; + ignore = c_grid_desc_m0_m10_m11_n0_n10_n11; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; + + compute_ptr_offset_of_batch.GetAPtrOffset(0); + compute_ptr_offset_of_batch.GetBPtrOffset(0); + compute_ptr_offset_of_batch.GetCPtrOffset(0); +#endif +} + +} // namespace + +// +// @brief Device Convolution operation. 
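+//
+// The convolution is computed as an implicit GEMM, one GEMM per group: the input image
+// provides the GEMM A matrix, the weights provide B, and the output image is the M x N
+// result (see MakeAGridDescriptor_AK0_M_AK1, MakeBGridDescriptor_BK0_N_BK1 and
+// MakeCGridDescriptor_M_N below).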
+// +// Supports: +// @li Forward convolution with up to 3 spatial dimentions +// @li Input tensor in GNWC data format +// @li Weight tensor in GKXC data format +// @li Output tensor in GNWK data format +// +// 1D: +// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] +// 2D: +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +// 3D: +// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] +// +template < + index_t NDimSpatial, + typename ADataType, + typename BDataType, + typename CDataType, + typename AccDataType, + typename ALayout, + typename BLayout, + typename CLayout, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + ConvolutionForwardSpecialization ConvForwardSpecialization, + GemmSpecialization GemmSpec, + index_t BlockSize, + index_t MPerBlock, + index_t NPerBlock, + index_t K0PerBlock, + index_t K1, + index_t M1PerThread, + index_t N1PerThread, + index_t KPerThread, + typename M1N1ThreadClusterM1Xs, + typename M1N1ThreadClusterN1Xs, + typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, + typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, + typename ABlockTransferSrcVectorTensorContiguousDimOrder, + typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, + typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, + typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, + typename BBlockTransferSrcVectorTensorContiguousDimOrder, + typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, + typename CThreadTransferSrcDstAccessOrder, + index_t CThreadTransferSrcDstVectorDim, + index_t CThreadTransferDstScalarPerVector, + enable_if_t< + is_same_v && + is_same_v && + is_same_v, + bool> = false> +struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd +{ + using DeviceOp = DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, K0PerBlock}; + + template + static auto + MakeAGridDescriptor_AK0_M_AK1(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + c_g_n_k_wos_lengths, + c_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + const auto M = in_gemmm_gemmk_desc.GetLength(I0); + const auto K = in_gemmm_gemmk_desc.GetLength(I1); + + const auto AK0 = K / K1; + + return transform_tensor_descriptor( 
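+            // [M, GemmK] -> [AK0, M, K1]: unmerge GemmK into (AK0, K1) and pass M through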
+ in_gemmm_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(AK0, K1)), make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + static auto + MakeBGridDescriptor_BK0_N_BK1(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) + { + const auto wei_gemmnraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); + + const auto wei_gemmn_gemmk_desc = + matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + const auto N = wei_gemmn_gemmk_desc.GetLength(I0); + const auto K = wei_gemmn_gemmk_desc.GetLength(I1); + + const auto BK0 = K / K1; + + return transform_tensor_descriptor( + wei_gemmn_gemmk_desc, + make_tuple(make_unmerge_transform(make_tuple(BK0, K1)), make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + template + static auto + MakeCGridDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides) + { + const auto out_gemmmraw_gemmnraw_desc = + conv_to_gemm_transformer.template MakeCDescriptor_M_N(c_g_n_k_wos_lengths, + c_g_n_k_wos_strides); + + const auto out_gemmm_gemmn_desc = + matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc); + + return out_gemmm_gemmn_desc; + } + + // desc for problem definition + using AGridDesc_AK0_M_AK1 = remove_cvref_t({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; + using BGridDesc_BK0_N_BK1 = + remove_cvref_t({}, {}))>; + using CGridDesc_M_N = remove_cvref_t({}, {}))>; + + // GridwiseGemm + using GridwiseGemm = + GridwiseGemmDl_km_kn_mn_v1r3; + + using AGridDesc_K0_M0_M1_K1 = + decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_AK0_M_AK1{})); + using BGridDesc_K0_N0_N1_K1 = + decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_BK0_N_BK1{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{})); + using DefaultBlock2CTileMap = + decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{})); + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a, + const void* p_b, + void* p_c, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op) + : p_a_grid_{static_cast(p_a)}, + p_b_grid_{static_cast(p_b)}, + p_c_grid_{static_cast(p_c)}, + num_group_{a_g_n_c_wis_lengths[0]}, + a_grid_desc_ak0_m_ak1_{ + DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + c_g_n_k_wos_lengths, + c_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1( + b_g_k_c_xs_lengths, b_g_k_c_xs_strides)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(c_g_n_k_wos_lengths, + c_g_n_k_wos_strides)}, + a_grid_desc_k0_m0_m1_k1_{}, + b_grid_desc_k0_n0_n1_k1_{}, + 
c_grid_desc_m0_m10_m11_n0_n10_n11_{}, + block_2_ctile_map_{}, + compute_ptr_offset_of_batch_{ + a_g_n_c_wis_strides[0], b_g_k_c_xs_strides[0], c_g_n_k_wos_strides[0]}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths}, + a_g_n_c_wis_strides_{a_g_n_c_wis_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + c_g_n_k_wos_lengths_{c_g_n_k_wos_lengths}, + c_g_n_k_wos_strides_{c_g_n_k_wos_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // A/B/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideC_ = c_g_n_k_wos_strides[0]; + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity( + a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, c_grid_desc_m_n_)) + { + + a_grid_desc_k0_m0_m1_k1_ = + GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_ak0_m_ak1_); + b_grid_desc_k0_n0_n1_k1_ = + GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_bk0_n_bk1_); + c_grid_desc_m0_m10_m11_n0_n10_n11_ = + GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_); + + block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[K0, M, K1]: " << a_grid_desc_ak0_m_ak1_ << std::endl; + std::cout << "B[K0, N, K1]: " << b_grid_desc_bk0_n_bk1_ << std::endl; + std::cout << "C[M, N]: " << c_grid_desc_m_n_ << std::endl; + std::cout << "num_group: " << num_group_ << std::endl; + + std::cout << "A[k0, m0, m1, k1]: " << a_grid_desc_k0_m0_m1_k1_ << std::endl; + std::cout << "B[k0, n0, n1, k1]: " << b_grid_desc_k0_n0_n1_k1_ << std::endl; + std::cout << "A[m0, m10, m11, n0, n10, n11]: " << c_grid_desc_m0_m10_m11_n0_n10_n11_ + << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + + // tensor descriptors for problem definiton + index_t num_group_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_; + BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_; + CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_; + + // block-to-e-tile map + DefaultBlock2CTileMap block_2_ctile_map_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + + // for checking IsSupportedArgument() + std::array a_g_n_c_wis_lengths_; + std::array a_g_n_c_wis_strides_; + std::array b_g_k_c_xs_lengths_; + std::array b_g_k_c_xs_strides_; + std::array c_g_n_k_wos_lengths_; + std::array c_g_n_k_wos_strides_; + std::array conv_filter_strides_; + std::array conv_filter_dilations_; + std::array input_left_pads_; + std::array input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + // if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + 
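+            // grid_size below = (number of C tiles per group) * num_group_; the kernel
+            // recovers each block's group index as block_id / num_blocks_per_batch.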
if(!GridwiseGemm::CheckValidity( + arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_)) + { + throw std::runtime_error( + "wrong! DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK has invalid setting"); + } + + const index_t grid_size = + GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_.GetLength(I0), + arg.c_grid_desc_m_n_.GetLength(I1)) * + arg.num_group_; + + auto launch_kernel = [&](auto has_main_k_block_loop, + auto has_double_tail_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + constexpr bool has_double_loop = has_double_tail_k_block_loop; + + const auto kernel = + kernel_grouped_conv_fwd_dl; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_k0_m0_m1_k1_, + arg.b_grid_desc_k0_n0_n1_k1_, + arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.block_2_ctile_map_, + arg.compute_ptr_offset_of_batch_); + }; + + const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0); + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0); + const bool has_double_tail_k_block_loop = + GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0); + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}, + integral_constant{}); + } + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + namespace ctc = tensor_layout::convolution; + + // check device + if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")) + { + return false; + } + + // check ConvolutionForwardSpecialization + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 3]; + const index_t ConvStride = arg.conv_filter_strides_[i]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0)) + { + std::cout << "Filter1x1Stride1Pad0 check: i = " << i << " X = " << X + << " ConvStride = " << ConvStride << " LeftPad = " << LeftPad + << " RightPad = " << RightPad << std::endl; + return false; + } + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 3]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && LeftPad == 0 && RightPad == 0)) + { + std::cout << "Filter1x1Stride1Pad0 check: i = " << i << " X = " << X + << " LeftPad = " << LeftPad << " RightPad = " << RightPad + << std::endl; + return false; + } + } + } + + // check vector access of A + // FIXME: layout + if 
constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + auto srcVectorLengths = ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1{}; + if(srcVectorLengths[I1] != 1 || srcVectorLengths[I2] != 1) + { + return false; + } + if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0) + { + return false; + } + + const index_t C = arg.a_g_n_c_wis_lengths_[2]; + + if(C % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector access of B + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + + { + auto srcVectorLengths = BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1{}; + if(srcVectorLengths[I1] != 1 || srcVectorLengths[I2] != 1) + { + return false; + } + if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0) + { + return false; + } + + const index_t C = arg.b_g_k_c_xs_lengths_[2]; + + if(C % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector access of C + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.c_g_n_k_wos_lengths_[2]; + + if(!(K % CThreadTransferDstScalarPerVector == 0 && CThreadTransferSrcDstVectorDim == 5)) + { + return false; + } + } + else + { + return false; + } + // check Gridwise GEMM + return GridwiseGemm::CheckValidity( + arg.a_grid_desc_ak0_m_ak1_, arg.b_grid_desc_bk0_n_bk1_, arg.c_grid_desc_m_n_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + void* p_c, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op) + { + return Argument{p_a, + p_b, + p_c, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + c_g_n_k_wos_lengths, + c_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op) override + { + return std::make_unique(p_a, + p_b, + p_c, + a_g_n_c_wis_lengths, + 
a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + c_g_n_k_wos_lengths, + c_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + c_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp new file mode 100644 index 00000000..1e2f8191 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Convolution Forward: +// input : input image A[G, N, C, Hi, Wi], +// input : weight B[G, K, C, Y, X], +// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ... +// output : output image E[G, N, K, Ho, Wo] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +template +struct DeviceGroupedConvFwdMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static_assert(NumDTensor == DsLayout::Size(), "wrong! 
Inconsistent NumDTensor"); + + virtual std::unique_ptr MakeArgumentPointer( + const void* p_a, // input image + const void* p_b, // weight + const std::array& p_ds, + void* p_e, // output image + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp new file mode 100644 index 00000000..181ee4b4 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm.hpp @@ -0,0 +1,51 @@ +#pragma once +#include +#include + +#include "device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +struct GemmDesc +{ + ck::index_t M_, N_, K_; + ck::index_t stride_A_, stride_B_, stride_C_; + + std::vector stride_Ds_; +}; + +template +struct DeviceGroupedGemm : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static_assert(DsLayout::Size() == DsDataType::Size(), "wrong! inconsisiten NumDTensor"); + + virtual std::unique_ptr + MakeArgumentPointer(std::vector& p_a, + std::vector& p_b, + std::vector>& p_ds, + std::vector& p_e, + std::vector& gemm_desc, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp new file mode 100644 index 00000000..b066a445 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
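+// Abstract interface for grouped GEMM + Softmax + GEMM with permuted output: each
+// ProblemDesc carries the per-problem tensor lengths/strides, and concrete device ops
+// implement MakeArgumentPointer / MakeInvokerPointer.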
+ +#pragma once + +#include +#include + +#include "device_base.hpp" +#include "ck/tensor_operation/gpu/device/masking_specialization.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGroupedGemmSoftmaxGemmPermute : public BaseOperator +{ + struct ProblemDesc + { + std::vector a_gs_ms_ks_lengths; + std::vector a_gs_ms_ks_strides; + + std::vector b0_gs_ns_ks_lengths; + std::vector b0_gs_ns_ks_strides; + + std::vector b1_gs_os_ns_lengths; + std::vector b1_gs_os_ns_strides; + + std::vector c_gs_ms_os_lengths; + std::vector c_gs_ms_os_strides; + + std::vector> acc0_biases_gs_ms_ns_lengths; + std::vector> acc0_biases_gs_ms_ns_strides; + + std::vector> acc1_biases_gs_ms_os_lengths; + std::vector> acc1_biases_gs_ms_os_strides; + }; + + virtual std::unique_ptr + MakeArgumentPointer(std::vector p_a_vec, + std::vector p_b0_vec, + std::vector p_b1_vec, + std::vector p_c_vec, + std::vector> p_acc0_biases_vec, + std::vector> p_acc1_biases_vec, + std::vector problem_desc_vec, + AElementwiseOperation a_element_op, + B0ElementwiseOperation b0_element_op, + Acc0ElementwiseOperation acc0_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp new file mode 100644 index 00000000..946a757c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -0,0 +1,881 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
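+// XDL/CShuffle implementation of the grouped GEMM + Softmax + GEMM operator: all problems
+// in the group are dispatched from a single kernel launch, and each block locates its
+// problem via a binary search over the per-group block ranges.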
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp" +#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_gemm_softmax_gemm_xdl_cshuffle_v1( + const void CK_CONSTANT_ADDRESS_SPACE* group_kernel_args, + const index_t group_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const AccElementwiseOperation acc_element_op, + const B1ElementwiseOperation b1_element_op, + const CElementwiseOperation c_element_op) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t block_id = get_block_1d_id(); + + const auto arg_ptr = reinterpret_cast( + cast_pointer_to_generic_address_space(group_kernel_args)); + + index_t left = 0; + index_t right = group_count; + index_t group_id = index_t((left + right) / 2); + + while( + (!(block_id >= arg_ptr[group_id].block_start_ && block_id < arg_ptr[group_id].block_end_))) + { + if(block_id < arg_ptr[group_id].block_start_) + { + right = group_id; + } + else + { + left = group_id; + } + group_id = index_t((left + right) / 2); + } + + // per-group batch offset + const index_t num_blocks_per_batch = arg_ptr[group_id].num_blocks_per_batch_; + const index_t g_idx = __builtin_amdgcn_readfirstlane( + (block_id - arg_ptr[group_id].block_start_) / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(arg_ptr[group_id].compute_base_ptr_of_batch_.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(arg_ptr[group_id].compute_base_ptr_of_batch_.GetBBasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(static_cast( + arg_ptr[group_id].compute_base_ptr_of_batch_.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(arg_ptr[group_id].compute_base_ptr_of_batch_.GetCBasePtr(g_idx))); + + GridwiseGemm::template Run( + arg_ptr[group_id].p_a_grid_ + a_batch_offset, + arg_ptr[group_id].p_b_grid_ + b_batch_offset, + arg_ptr[group_id].p_b1_grid_ + b1_batch_offset, + arg_ptr[group_id].p_c_grid_ + c_batch_offset, + p_shared, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op, + arg_ptr[group_id].a_grid_desc_ak0_m_ak1_, + arg_ptr[group_id].b_grid_desc_bk0_n_bk1_, + arg_ptr[group_id].b1_grid_desc_bk0_n_bk1_, + arg_ptr[group_id].c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg_ptr[group_id].block_2_ctile_map_, + arg_ptr[group_id].c0_matrix_mask_); +#else + ignore = group_kernel_args; + ignore = group_count; + ignore = a_element_op; + ignore = b_element_op; + 
ignore = acc_element_op; + ignore = b1_element_op; + ignore = c_element_op; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// Computes C = A * B0 * B1 +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle + : public DeviceGroupedGemmSoftmaxGemmPermute +{ + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0, + "Number of dimension must be greater than 0"); + + static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); + static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size(); + + // TODO ANT: implement bias combination + static_assert(NumAcc0Bias == 0 && NumAcc0Bias == 0, "Bias addition is unimplemented"); + +#if 0 + // TODO ANT: use alias + static constexpr index_t NumDimGemm0M = NumDimM; + static constexpr index_t NumDimGemm0N = NumDimN; + static constexpr index_t NumDimGemm0K = NumDimK; + static constexpr index_t NumDimGemm1M = NumDimM; + static constexpr index_t NumDimGemm1N = NumDimO; + static constexpr index_t NumDimGemm1K = NumDimN; +#endif + + using DeviceOp = DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle; + using ProblemDesc = typename DeviceGroupedGemmSoftmaxGemmPermute::ProblemDesc; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< + Sequence, + Sequence, + GemmSpec, + ASpec, + BSpec, + B1Spec, + CSpec>; + + static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + return Transform::MakeAGridDescriptor_AK0_M_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}); + } + + static auto MakeBGridDescriptor_BK0_N_BK1(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) + { + return Transform::MakeB0GridDescriptor_BK0_N_BK1( + Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), + Number{}); + } + + static auto + MakeB1GridDescriptor_BK0_N_BK1(const std::vector& b1_gs_gemm1ns_gemm1ks_lengths_vec, + const std::vector& b1_gs_gemm1ns_gemm1ks_strides_vec) + { + return Transform::MakeB1GridDescriptor_BK0_N_BK1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, + b1_gs_gemm1ns_gemm1ks_strides_vec), + Number{}); + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1({}, {})); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1({}, {})); + using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1({}, {})); + using CGridDesc_M_N = decltype(Transform::MakeCGridDescriptor_M_N({}, {})); + using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); + using BGridDesc_G_N_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_K = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); + + constexpr static auto make_MaskOutPredicate() + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + return MaskDisabledPredicate{}; + } + else if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + return MaskOutUpperTrianglePredicate{}; + } + } + using C0MatrixMask = C0MatrixMask_impl; + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& 
a_grid_desc_g_m_k, + const BGridDesc_G_N_K& b_grid_desc_g_n_k, + const B1GridDesc_G_N_K& b1_grid_desc_g_n_k, + const CGridDesc_G_M_N& c_grid_desc_g_m_n) + : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), + b_grid_desc_g_n_k_(b_grid_desc_g_n_k), + b1_grid_desc_g_n_k_(b1_grid_desc_g_n_k), + c_grid_desc_g_m_n_(c_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return b_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return b1_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + private: + AGridDesc_G_M_K a_grid_desc_g_m_k_; + BGridDesc_G_N_K b_grid_desc_g_n_k_; + B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + }; + + // GridwiseGemm + using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + B1GridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + AK1, + BK1, + B1K1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + Gemm1NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + true, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + true, + BBlockLdsExtraN, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + false, + B1BlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched, + Transform::matrix_padder.PadN, + MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>; + + using Block2CTileMap = OffsettedBlockToCTileMap; + + struct GroupKernelArg + { + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + + // batch & stride + index_t num_blocks_per_batch_; + ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + + // check C0 masking and 
padding + C0MatrixMask c0_matrix_mask_; + + // block-to-c-tile map + Block2CTileMap block_2_ctile_map_; + + index_t block_start_, block_end_; + }; + + struct GroupDeviceArg + { + // lengths for the last dimensions of overall problem for sanity check of vector load/store + std::vector raw_lengths_mz_nz_kz_gemm1nz_; + + // strides for the last dimensions of each tensor for sanity check of vector load/store + std::vector a_mz_kz_strides_; + std::vector b_nz_kz_strides_; + std::vector b1_nz_kz_strides_; + std::vector c_mz_gemm1nz_strides_; + + // for gridwise gemm check + CGridDesc_M_N c_grid_desc_m_n_; + }; + + // Argument + // FIXME: constness + struct Argument : public BaseArgument + { + Argument(std::vector p_a_vec, + std::vector p_b_vec, + std::vector p_b1_vec, + std::vector p_c_vec, + std::vector> p_acc0_biases_vec, + std::vector> p_acc1_biases_vec, + std::vector problem_desc_vec, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op} + { + // TODO ANT: implement bias addition + group_count_ = problem_desc_vec.size(); + + if(!(group_count_ == p_a_vec.size() && group_count_ == p_b_vec.size() && + group_count_ == p_b1_vec.size() && group_count_ == p_c_vec.size())) + { + throw std::runtime_error("wrong! group_count_ != a/b/b1/c_vec.size"); + } + + if(!(p_acc0_biases_vec.size() == p_acc1_biases_vec.size())) + { + throw std::runtime_error("wrong! acc0_bias_vec.size != acc1_bias_vec.size"); + } + + grid_size_ = 0; + + for(std::size_t i = 0; i < group_count_; i++) + { + const auto p_a_grid = static_cast(p_a_vec[i]); + const auto p_b_grid = static_cast(p_b_vec[i]); + const auto p_b1_grid = static_cast(p_b1_vec[i]); + const auto p_c_grid = static_cast(p_c_vec[i]); + + const auto& problem_desc = problem_desc_vec[i]; + + const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1( + problem_desc.a_gs_ms_ks_lengths, problem_desc.a_gs_ms_ks_strides); + const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1( + problem_desc.b0_gs_ns_ks_lengths, problem_desc.b0_gs_ns_ks_strides); + const auto b1_grid_desc_bk0_n_bk1 = MakeB1GridDescriptor_BK0_N_BK1( + problem_desc.b1_gs_os_ns_lengths, problem_desc.b1_gs_os_ns_strides); + const auto c_grid_desc_m_n = Transform::MakeCGridDescriptor_M_N( + problem_desc.c_gs_ms_os_lengths, problem_desc.c_gs_ms_os_strides); + + const auto a_grid_desc_g_m_k = Transform::MakeAGridDescriptor_G_M_K( + problem_desc.a_gs_ms_ks_lengths, problem_desc.a_gs_ms_ks_strides); + const auto b_grid_desc_g_n_k = Transform::MakeB0GridDescriptor_G_N_K( + problem_desc.b0_gs_ns_ks_lengths, problem_desc.b0_gs_ns_ks_strides); + const auto b1_grid_desc_g_n_k = Transform::MakeB1GridDescriptor_G_N_K( + problem_desc.b1_gs_os_ns_lengths, problem_desc.b1_gs_os_ns_strides); + const auto c_grid_desc_g_m_n = Transform::MakeCGridDescriptor_G_M_N( + problem_desc.c_gs_ms_os_lengths, problem_desc.c_gs_ms_os_strides); + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n); + + const index_t BlockStart = grid_size_; + const auto block_2_ctile_map = Block2CTileMap(c_grid_desc_m_n, BlockStart); + const index_t batch_count = c_grid_desc_g_m_n.GetLength(I0); + const index_t grid_size_grp = + 
block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n) * batch_count; + const index_t BlockEnd = grid_size_ + grid_size_grp; + + // batch stride + const auto compute_base_ptr_of_batch = ComputeBasePtrOfStridedBatch( + a_grid_desc_g_m_k, b_grid_desc_g_n_k, b1_grid_desc_g_n_k, c_grid_desc_g_m_n); + + // C0 mask + const auto c0_matrix_mask = C0MatrixMask(b_grid_desc_g_n_k.GetLength(I1)); + + grid_size_ += grid_size_grp; + + // for each group, make sure acc0_biases_gs_ms_ns_lengths.size() == NumAcc0Bias and + // so on + if(!(problem_desc.acc0_biases_gs_ms_ns_lengths.size() == NumAcc0Bias && + problem_desc.acc0_biases_gs_ms_ns_strides.size() == NumAcc0Bias && + problem_desc.acc1_biases_gs_ms_os_lengths.size() == NumAcc1Bias && + problem_desc.acc1_biases_gs_ms_os_strides.size() == NumAcc1Bias)) + { + throw std::runtime_error( + "wrong! number of biases in function argument does not " + "match that in template argument"); + } + + group_kernel_args_.push_back({p_a_grid, + p_b_grid, + p_b1_grid, + p_c_grid, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b1_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n), + compute_base_ptr_of_batch, + c0_matrix_mask, + block_2_ctile_map, + BlockStart, + BlockEnd}); + + group_device_args_.push_back( + {{problem_desc.a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], + problem_desc.b0_gs_ns_ks_lengths[NumDimG + NumDimN - 1], + problem_desc.b0_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1], + problem_desc.b1_gs_os_ns_lengths[NumDimG + NumDimO - 1]}, + {problem_desc.a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + problem_desc.a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, + {problem_desc.b0_gs_ns_ks_strides[NumDimG + NumDimN - 1], + problem_desc.b0_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]}, + {problem_desc.b1_gs_os_ns_strides[NumDimG + NumDimO - 1], + problem_desc.b1_gs_os_ns_strides[NumDimG + NumDimO + NumDimN - 1]}, + {problem_desc.c_gs_ms_os_strides[NumDimG + NumDimM - 1], + problem_desc.c_gs_ms_os_strides[NumDimG + NumDimM + NumDimO - 1]}, + c_grid_desc_m_n}); + } + } + + std::vector group_kernel_args_; + std::vector group_device_args_; + + std::size_t group_count_; + index_t grid_size_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!DeviceOp::IsSupportedArgument(arg)) + { + throw std::runtime_error("wrong! 
unsupported argument"); + } + + bool all_has_main_k_block_loop = true; + bool some_has_main_k_block_loop = false; + for(std::size_t i = 0; i < arg.group_count_; i++) + { + const auto K = arg.group_kernel_args_[i].a_grid_desc_ak0_m_ak1_.GetLength(I0) * + arg.group_kernel_args_[i].a_grid_desc_ak0_m_ak1_.GetLength(I2); + const bool y = GridwiseGemm::CalculateHasMainKBlockLoop(K); + all_has_main_k_block_loop &= y; + some_has_main_k_block_loop |= y; + } + + hipGetErrorString(hipMemcpy(arg.p_workspace_, + arg.group_kernel_args_.data(), + arg.group_kernel_args_.size() * sizeof(GroupKernelArg), + hipMemcpyHostToDevice)); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = + kernel_grouped_gemm_softmax_gemm_xdl_cshuffle_v1; + + return launch_and_time_kernel( + stream_config, + kernel, + dim3(arg.grid_size_), + dim3(BlockSize), + 0, + cast_pointer_to_constant_address_space(arg.p_workspace_), + arg.group_count_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.b1_element_op_, + arg.c_element_op_); + }; + + // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need + // to concern Gemm0's loop + if(all_has_main_k_block_loop) + { + ave_time = launch_kernel(integral_constant{}); + } + else if(!some_has_main_k_block_loop) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + throw std::runtime_error("wrong! all gemm problems have to simultaneously meet " + "has_main_k_block_loop or no_main_k_block_loop"); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + // TODO ANT: Check if tensor specialization & strides mismatch + + bool all_has_main_k_block_loop = true; + bool some_has_main_k_block_loop = false; + + for(std::size_t i = 0; i < arg.group_count_; i++) + { + const auto& kernel_arg = arg.group_kernel_args_[i]; + const auto& device_arg = arg.group_device_args_[i]; + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_m = device_arg.c_grid_desc_m_n_.GetLength(I0); + const index_t c_gemm1n = device_arg.c_grid_desc_m_n_.GetLength(I1); + const index_t a_m = kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); + const index_t b1_gemm1n = kernel_arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1); + if(!(c_m == a_m && c_gemm1n == b1_gemm1n)) + { + return false; + } + + // Check if having main loop + const auto K = kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * + kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + const bool y = GridwiseGemm::CalculateHasMainKBlockLoop(K); + all_has_main_k_block_loop &= y; + some_has_main_k_block_loop |= y; + + // Note: we need raw lengths since threadwise copy can not handle vector load when + // part of vector is out of bounds + const auto MzRaw = device_arg.raw_lengths_mz_nz_kz_gemm1nz_[0]; + const auto NzRaw = device_arg.raw_lengths_mz_nz_kz_gemm1nz_[1]; + const auto KzRaw = device_arg.raw_lengths_mz_nz_kz_gemm1nz_[2]; + const auto Gemm1NzRaw = device_arg.raw_lengths_mz_nz_kz_gemm1nz_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? 
KzRaw : MzRaw; + const auto b_extent_lowest = BBlockTransferSrcVectorDim == 2 ? KzRaw : NzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? NzRaw : Gemm1NzRaw; + const auto c_extent_lowest = Gemm1NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + + // Check vector load/store requirement + const auto a_stride_lowest = ABlockTransferSrcVectorDim == 2 + ? device_arg.a_mz_kz_strides_[1] + : device_arg.a_mz_kz_strides_[0]; + const auto b_stride_lowest = BBlockTransferSrcVectorDim == 2 + ? device_arg.b_nz_kz_strides_[1] + : device_arg.b_nz_kz_strides_[0]; + const auto b1_stride_lowest = B1BlockTransferSrcVectorDim == 2 + ? device_arg.b1_nz_kz_strides_[1] + : device_arg.b1_nz_kz_strides_[0]; + const auto c_stride_lowest = + device_arg.c_mz_gemm1nz_strides_[1]; // cshuffle assumes lowest dim in Gemm1Ns to be + // contiguous + + if(!(a_stride_lowest == 1 || b_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + return false; + } + + if(!GridwiseGemm::CheckValidity(kernel_arg.a_grid_desc_ak0_m_ak1_, + kernel_arg.b_grid_desc_bk0_n_bk1_, + kernel_arg.b1_grid_desc_bk0_n_bk1_, + device_arg.c_grid_desc_m_n_, + kernel_arg.block_2_ctile_map_)) + { + return false; + } + } + + // all gemm problems have to simultaneously meet has_main_k_block_loop or + // no_main_k_block_loop + if(!(all_has_main_k_block_loop || !some_has_main_k_block_loop)) + { + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(std::vector p_a_vec, + std::vector p_b_vec, + std::vector p_b1_vec, + std::vector p_c_vec, + std::vector> p_acc0_biases_vec, + std::vector> p_acc1_biases_vec, + std::vector problem_desc_vec, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a_vec, + p_b_vec, + p_b1_vec, + p_c_vec, + p_acc0_biases_vec, + p_acc1_biases_vec, + problem_desc_vec, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(std::vector p_a_vec, + std::vector p_b_vec, + std::vector p_b1_vec, + std::vector p_c_vec, + std::vector> p_acc0_biases_vec, + std::vector> p_acc1_biases_vec, + std::vector problem_desc_vec, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(p_a_vec, + p_b_vec, + p_b1_vec, + p_c_vec, + p_acc0_biases_vec, + p_acc1_biases_vec, + problem_desc_vec, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedGemmSoftmaxGemmPermute_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock 
<< ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << Gemm1NPerBlock << ", " + << Gemm1KPerBlock << ", " + << B1K1 << ", " + << getGemmSpecializationString(GemmSpec) << ", " + << "ASpec" << getTensorSpecializationString(ASpec) << ", " + << "B0Spec" << getTensorSpecializationString(BSpec) << ", " + << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " + << "CSpec" << getTensorSpecializationString(CSpec) << ", " + << getMaskingSpecializationString(MaskingSpec) << ">"; + // clang-format on + + return str.str(); + } + + size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override + { + return dynamic_cast(p_arg)->group_count_ * sizeof(GroupKernelArg); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp new file mode 100644 index 00000000..93202e35 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_multiple_reduce.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/utility/reduction_enums.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceMultipleReduce : public BaseOperator +{ + static constexpr index_t NumInputDim = Rank; + static constexpr index_t NumOutputDim = (Rank - NumReduceDim > 1) ? Rank - NumReduceDim : 1; + + virtual std::unique_ptr MakeArgumentPointer( + const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array, NumReduction> outStrides, + const std::array reduceDims, + const std::array alphas, + const std::array betas, + const void* in_dev, + const std::array out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceMultipleReducePtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_normalization.hpp b/include/ck/tensor_operation/gpu/device/device_normalization.hpp new file mode 100644 index 00000000..227c352c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +template +struct DeviceNormalization : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::vector xStrides, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + AccDataType epsilon, + const void* p_x, + const void* p_gamma, + const void* p_beta, + void* p_y, + void* p_savedMean, + void* p_savedInvVar, + AccElementwiseOperation acc_elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceNormalizationPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_permute.hpp b/include/ck/tensor_operation/gpu/device/device_permute.hpp new file mode 100644 index 00000000..baa91447 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_permute.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DevicePermute : BaseOperator +{ + using Lengths = std::array; + using Strides = Lengths; + + virtual std::unique_ptr + MakeArgumentPointer(const Lengths& in_lengths, + const Strides& in_strides, + const Lengths& out_lengths, + const Strides& out_strides, + const void* in_dev_buffer, + void* out_dev_buffer, + ElementwiseOperation elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp new file mode 100644 index 00000000..3b376c6f --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/utility/reduction_enums.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DevicePool2dFwd : public BaseOperator +{ + virtual std::unique_ptr + MakeArgumentPointer(const void* in_dev, + void* out_dev, + void* out_indices_dev, + ck::index_t N, + ck::index_t C, + std::array input_spatial_lengths, + std::array window_spatial_lengths, + std::array output_spatial_lengths, + std::array window_strides, + std::array input_left_pads, + std::array input_right_pads) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DevicePool2dFwdPtr = std::unique_ptr>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp new file mode 100644 index 00000000..15aeb8e9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
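// Illustrative note: DeviceNormalization above only declares the argument
// plumbing (lengths/strides, reduceDims, epsilon, optional saved mean and
// inverse variance).  Assuming the layernorm-style semantics used by the
// layernorm examples elsewhere in this patch, the host-side reference below
// spells out what one normalized slice computes; it is a reading aid, not the
// CK device implementation.
#include <cmath>
#include <cstddef>
#include <vector>

// Normalize each row of a [rows x cols] buffer:
//   y = gamma * (x - mean) / sqrt(var + eps) + beta
// mean and 1/sqrt(var + eps) correspond to the optional p_savedMean /
// p_savedInvVar outputs in the interface above.
inline void reference_layernorm_2d(const std::vector<float>& x,
                                   const std::vector<float>& gamma,
                                   const std::vector<float>& beta,
                                   std::vector<float>& y,
                                   std::size_t rows,
                                   std::size_t cols,
                                   float epsilon)
{
    for(std::size_t r = 0; r < rows; ++r)
    {
        float mean = 0.f;
        for(std::size_t c = 0; c < cols; ++c)
            mean += x[r * cols + c];
        mean /= static_cast<float>(cols);

        float var = 0.f;
        for(std::size_t c = 0; c < cols; ++c)
        {
            const float d = x[r * cols + c] - mean;
            var += d * d;
        }
        var /= static_cast<float>(cols);

        const float inv_std = 1.f / std::sqrt(var + epsilon);
        for(std::size_t c = 0; c < cols; ++c)
            y[r * cols + c] = gamma[c] * (x[r * cols + c] - mean) * inv_std + beta[c];
    }
}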
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduce : public BaseOperator +{ + static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; + + virtual std::unique_ptr + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const void* in_dev, + const void* in_index_dev, + void* out_dev, + void* out_index_dev, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +template +using DeviceReducePtr = std::unique_ptr< + DeviceReduce>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp new file mode 100644 index 00000000..676e0812 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceSoftmax : public BaseOperator +{ + // + // @brief Makes a pointer to Argument class. + // + // @param[in] inLengths Input tensor extent(s) from high to low dimension + // @param[in] inStrides Input tensor stride(s) from high to low dimension + // @param[in] reduceDims The dimension(s) the normalization operation is applied + // @param[in] alpha Typeless pointer in host memory storing the alpha scaling + // value as type AccDataType + // @param[in] beta Typeless pointer in host memory storing the beta scaling + // value as type AccDataType + // @param[in] in_dev Typeless const pointer in device memory storing the input + // tensor + // @param out_dev Typeless pointer in device memory storing the output tensor + // @param[in] in_elementwise_op The input elementwise operation. + // @param[in] acc_elementwise_op The accumulation elementwise operation. + // + // @return Unique pointer to the Argument class. + // + virtual std::unique_ptr + MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + const void* alpha, + const void* beta, + const void* in_dev, + void* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; + virtual index_t GetRank() const = 0; + virtual index_t GetNumReduceDim() const = 0; +}; + +template +using DeviceSoftmaxPtr = std::unique_ptr< + DeviceSoftmax>; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp new file mode 100644 index 00000000..f59e6093 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
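// Illustrative note: the comment block above describes alpha and beta as
// typeless host pointers holding AccDataType values.  The sketch below is a
// host-side reference of a numerically stable softmax along the innermost
// dimension; the alpha/beta blending convention (y = alpha * softmax(x) +
// beta * y) is an assumption based on the usual CK scaling semantics, not
// something stated in this header.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

inline void reference_softmax_rows(const std::vector<float>& x,
                                   std::vector<float>& y,
                                   std::size_t rows,
                                   std::size_t cols,
                                   float alpha,
                                   float beta)
{
    for(std::size_t r = 0; r < rows; ++r)
    {
        const float* xr = &x[r * cols];
        float*       yr = &y[r * cols];

        // subtract the row maximum before exponentiating for numerical stability
        const float x_max = *std::max_element(xr, xr + cols);

        float sum = 0.f;
        for(std::size_t c = 0; c < cols; ++c)
            sum += std::exp(xr[c] - x_max);

        for(std::size_t c = 0; c < cols; ++c)
            yr[c] = alpha * (std::exp(xr[c] - x_max) / sum) + beta * yr[c];
    }
}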
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] +// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] +// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +template +struct DeviceSplitKContractionMultipleD : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + virtual std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + index_t split_k) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..8eab1cde --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,1147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
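// Illustrative note: with packed layouts the dimension groups above collapse
// into a single batched GEMM, G = prod(Gi), M = prod(Mi), N = prod(Ni),
// K = prod(Ki), and E[g, m, n] = cde_op(sum_k a_op(A[g, m, k]) *
// b_op(B[g, n, k]), D0[g, m, n], ...).  The host-side sketch below spells out
// that collapsed form for identity a_op/b_op and a Bilinear-style cde_op
// (alpha * (A*B) + beta * D); it is a reading aid for the interface, not the
// split-K device kernel.
#include <cstddef>
#include <vector>

inline void reference_contraction_bilinear(const std::vector<float>& A, // [G, M, K] packed
                                           const std::vector<float>& B, // [G, N, K] packed
                                           const std::vector<float>& D, // [G, M, N] packed
                                           std::vector<float>&       E, // [G, M, N] packed
                                           std::size_t G,
                                           std::size_t M,
                                           std::size_t N,
                                           std::size_t K,
                                           float alpha,
                                           float beta)
{
    for(std::size_t g = 0; g < G; ++g)
        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                    acc += A[(g * M + m) * K + k] * B[(g * N + n) * K + k];

                // Bilinear-style cde_op: E = alpha * (A*B) + beta * D
                E[(g * M + m) * N + n] = alpha * acc + beta * D[(g * M + m) * N + n];
            }
}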
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_splitk_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_contraction_multiple_d_xdl_cshuffle( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatDsPointer p_ds_grid, + FloatE* __restrict__ p_e_grid, + const index_t batch_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1, + const BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + FloatDsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_akb_ak0_m_ak1, + b_grid_desc_bkb_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = a_grid_desc_akb_ak0_m_ak1; + ignore = b_grid_desc_bkb_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = 
block_2_etile_map; + ignore = compute_ptr_offset_of_batch; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] +// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] +// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +template +struct DeviceSplitKContractionMultipleD_Xdl_CShuffle + : public DeviceSplitKContractionMultipleD +{ + using DeviceOp = DeviceSplitKContractionMultipleD_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + // Assume: A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] + static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + assert(a_gs_ms_ks_lengths_vec.size() == NumDimG + NumDimM + NumDimK && + a_gs_ms_ks_strides_vec.size() == NumDimG + NumDimM + NumDimK); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto a_ms_ks_lengths = to_tuple( + a_gs_ms_ks_lengths_vec, Number{}, Number{}); + const auto a_ms_ks_strides = to_tuple( + a_gs_ms_ks_strides_vec, Number{}, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); + + if constexpr(ASpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor( + make_tuple(M, K), + make_tuple(a_ms_ks_strides[Number{}], + a_ms_ks_strides[Number{}])); + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + else + { + // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] + const auto a_grid_desc_ms_ks = + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); + + // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] + const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( + a_grid_desc_ms_ks, + make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), + make_tuple(mDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + } + + // Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] 
+ static auto MakeBGridDescriptor_N_K(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) + { + assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK && + b_gs_ns_ks_strides_vec.size() == NumDimG + NumDimN + NumDimK); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto b_ns_ks_lengths = to_tuple( + b_gs_ns_ks_lengths_vec, Number{}, Number{}); + const auto b_ns_ks_strides = to_tuple( + b_gs_ns_ks_strides_vec, Number{}, Number{}); + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen<0, NumDimN, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(b_ns_ks_lengths, kDimIds); + + // lengths for N0, N1, ... + const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); + + if constexpr(BSpec == TensorSpecialization::Packed) + { + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor( + make_tuple(N, K), + make_tuple(b_ns_ks_strides[Number{}], + b_ns_ks_strides[Number{}])); + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + else + { + // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] + const auto b_grid_desc_ns_ks = + make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); + + // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] + const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( + b_grid_desc_ns_ks, + make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), + make_tuple(nDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + } + + // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + static auto MakeEGridDescriptor_M_N(const std::vector& e_gs_ms_ns_lengths_vec, + const std::vector& e_gs_ms_ns_strides_vec) + { + assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto e_ms_ns_lengths = to_tuple( + e_gs_ms_ns_lengths_vec, Number{}, Number{}); + const auto e_ms_ns_strides = to_tuple( + e_gs_ms_ns_strides_vec, Number{}, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... 
+ const auto nLengths = get_container_subset(e_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(M, N), + make_tuple(e_ms_ns_strides[Number{}], + e_ms_ns_strides[Number{}])); + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + else + { + // naive tensor E[M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_ms_ns = + make_naive_tensor_descriptor(e_ms_ns_lengths, e_ms_ns_strides); + + // transformed tensor E[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...] + const auto e_grid_desc_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + } + + // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + static auto MakeEGridDescriptor_G_M_N(const std::vector& e_gs_ms_ns_lengths_vec, + const std::vector& e_gs_ms_ns_strides_vec) + { + assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto e_gs_ms_ns_lengths = + to_tuple(e_gs_ms_ns_lengths_vec, Number<0>{}, Number{}); + const auto e_gs_ms_ns_strides = + to_tuple(e_gs_ms_ns_strides_vec, Number<0>{}, Number{}); + + // dimension Ids for G0, G1, ... + constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{}; + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = + typename arithmetic_sequence_gen::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen::type{}; + + // lengths for G0, G1, ... + const auto gLengths = get_container_subset(e_gs_ms_ns_lengths, gDimIds); + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_gs_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto nLengths = get_container_subset(e_gs_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto G = container_reduce(gLengths, math::multiplies{}, Number<1>{}); + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_g_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(G, M, N), + make_tuple(e_gs_ms_ns_strides[Number{}], + e_gs_ms_ns_strides[Number{}], + e_gs_ms_ns_strides[Number{}])); + // return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw); + return e_grid_desc_g_mraw_nraw; + } + else + { + // naive tensor E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_gs_ms_ns = + make_naive_tensor_descriptor(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + // transformed tensor E[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * + // N2 * ...] 
+ const auto e_grid_desc_g_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_gs_ms_ns, + make_tuple(make_merge_transform(gLengths), + make_merge_transform(mLengths), + make_merge_transform(nLengths)), + make_tuple(gDimIds, mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw); + return e_grid_desc_g_mraw_nraw; + } + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_gs_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_gs_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths_vec[i], + ds_gs_ms_ns_strides_vec[i]); + }, + Number{}); + } + + static auto MakeDsGridDescriptor_G_M_N( + const std::array, NumDTensor>& ds_gs_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_gs_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_G_M_N(ds_gs_ms_ns_lengths_vec[i], + ds_gs_ms_ns_strides_vec[i]); + }, + Number{}); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + + using DsGridDesc_G_M_N = remove_cvref_t; + using EGridDesc_G_M_N = decltype(MakeEGridDescriptor_G_M_N({}, {})); + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t batch_stride_A, + index_t batch_stride_B, + DsGridDesc_G_M_N ds_grid_desc_g_m_n, + EGridDesc_G_M_N e_grid_desc_g_m_n) + : batch_stride_A_(batch_stride_A), + batch_stride_B_(batch_stride_B), + ds_grid_desc_g_m_n_(ds_grid_desc_g_m_n), + e_grid_desc_g_m_n_(e_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(batch_stride_A_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(batch_stride_B_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + std::array ds_offset; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_offset[i] = static_cast(g_idx) * + ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(1, 0, 0)); + }); + + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * + e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(1, 0, 0)); + } + + private: + index_t batch_stride_A_; + index_t batch_stride_B_; + DsGridDesc_G_M_N ds_grid_desc_g_m_n_; + EGridDesc_G_M_N e_grid_desc_g_m_n_; + }; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmSplitKMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_M_K, + BGridDesc_N_K, + DsGridDesc_M_N, + EGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + 
BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // GridwiseGemm + using GridwiseGemmAtomicAdd = GridwiseGemmSplitKMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::AtomicAdd, + AGridDesc_M_K, + BGridDesc_N_K, + DsGridDesc_M_N, + EGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + using AGridDesc_AKB_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BKB_BK0_N_BK1 = remove_cvref_t; + + using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + index_t split_k) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_m_k_{ + DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ns_lengths, a_gs_ms_ks_strides)}, + b_grid_desc_n_k_{ + DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{ + DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)}, + ds_grid_desc_g_m_n_{ + DeviceOp::MakeDsGridDescriptor_G_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides)}, + e_grid_desc_g_m_n_{ + DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)}, + a_grid_desc_akb_ak0_m_ak1_{GridwiseGemm::MakeDefaultAGridDescriptor_AKB_AK0_M_AK1( + a_grid_desc_m_k_, split_k)}, + b_grid_desc_bkb_bk0_n_bk1_{GridwiseGemm::MakeDefaultBGridDescriptor_BKB_BK0_N_BK1( + b_grid_desc_n_k_, split_k)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{ + GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_, split_k)}, + 
a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_mz_stride_{}, + a_kz_stride_{}, + b_nz_stride_{}, + b_kz_stride_{}, + ds_nz_stride_{}, + e_nz_stride_{}, + a_batch_stride_{a_gs_ms_ks_strides[NumDimG - 1]}, + b_batch_stride_{b_gs_ns_ks_strides[NumDimG - 1]}, + compute_ptr_offset_of_batch_{ + a_batch_stride_, b_batch_stride_, ds_grid_desc_g_m_n_, e_grid_desc_g_m_n_}, + split_k_{split_k} + { + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0, ""); + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths[i], + ds_gs_ms_ns_strides[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_akb_ak0_m_ak1_, + b_grid_desc_bkb_bk0_n_bk1_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + } + + // for sanity check of vector memory access + a_mz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM - 1]; + a_kz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]; + b_nz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN - 1]; + b_kz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]; + + for(index_t i = 0; i < NumDTensor; ++i) + { + ds_nz_stride_[i] = ds_gs_ms_ns_strides[i][NumDimG + NumDimM + NumDimN - 1]; + } + + e_nz_stride_ = e_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]; + + Print(); + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_.GetLength(I0) << ", " + << a_grid_desc_m_k_.GetLength(I1) << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_.GetLength(I0) << ", " + << b_grid_desc_n_k_.GetLength(I1) << std::endl; + + std::cout << "A[akb, ak0, m, ak1]: " << a_grid_desc_akb_ak0_m_ak1_.GetLength(I0) << ", " + << a_grid_desc_akb_ak0_m_ak1_.GetLength(I1) << ", " + << a_grid_desc_akb_ak0_m_ak1_.GetLength(I2) << ", " + << a_grid_desc_akb_ak0_m_ak1_.GetLength(I3) << std::endl; + std::cout << "B[bkb, bk0, n, bk1]: " << b_grid_desc_bkb_bk0_n_bk1_.GetLength(I0) << ", " + << b_grid_desc_bkb_bk0_n_bk1_.GetLength(I1) << ", " + << b_grid_desc_bkb_bk0_n_bk1_.GetLength(I2) << ", " + << b_grid_desc_bkb_bk0_n_bk1_.GetLength(I3) << std::endl; + static_for<0, NumDTensor, 1>{}([&](auto i) { + std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i].GetLength(I0) << ", " + << ds_grid_desc_m_n_[i].GetLength(I1) << std::endl; + }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_.GetLength(I0) << ", " + << e_grid_desc_m_n_.GetLength(I1) << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + DsGridDesc_G_M_N ds_grid_desc_g_m_n_; + EGridDesc_G_M_N e_grid_desc_g_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AKB_AK0_M_AK1 a_grid_desc_akb_ak0_m_ak1_; + BGridDesc_BKB_BK0_N_BK1 b_grid_desc_bkb_bk0_n_bk1_; + typename 
GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // Strides for the last M/N/K dimensions of A/B/Ds/E + // for sanity check of vector load/store + index_t a_mz_stride_; + index_t a_kz_stride_; + index_t b_nz_stride_; + index_t b_kz_stride_; + std::array ds_nz_stride_; + index_t e_mz_stride_; + index_t e_nz_stride_; + + index_t a_batch_stride_; + index_t b_batch_stride_; + + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + index_t split_k_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_akb_ak0_m_ak1_, + arg.b_grid_desc_bkb_bk0_n_bk1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting"); + } + + const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0); + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; + + const auto K = arg.a_grid_desc_akb_ak0_m_ak1_.GetLength(I1) * + arg.a_grid_desc_akb_ak0_m_ak1_.GetLength(I3); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_contraction_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AKB_AK0_M_AK1, + DeviceOp::BGridDesc_BKB_BK0_N_BK1, + typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + typename GridwiseGemm::DefaultBlock2ETileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + G, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_akb_ak0_m_ak1_, + arg.b_grid_desc_bkb_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_etile_map_); + }; + + auto launch_kernel_atomic_add = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_contraction_multiple_d_xdl_cshuffle< + GridwiseGemmAtomicAdd, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemmAtomicAdd::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AKB_AK0_M_AK1, + DeviceOp::BGridDesc_BKB_BK0_N_BK1, + typename GridwiseGemmAtomicAdd:: + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemmAtomicAdd:: + EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + typename 
GridwiseGemmAtomicAdd::DefaultBlock2ETileMap, + has_main_loop>; + + hipGetErrorString(hipMemset( + arg.p_e_grid_, + 0, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(EDataType))); + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + G, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_akb_ak0_m_ak1_, + arg.b_grid_desc_bkb_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + if(arg.split_k_ <= 1) + return launch_kernel(integral_constant{}); + else + return launch_kernel_atomic_add(integral_constant{}); + } + else + { + if(arg.split_k_ <= 1) + return launch_kernel(integral_constant{}); + else + return launch_kernel_atomic_add(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_akb_ak0_m_ak1_, + arg.b_grid_desc_bkb_bk0_n_bk1_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + return false; + } + + // check vector access + static_assert((ABlockTransferSrcVectorDim == 2 || ABlockTransferSrcVectorDim == 3) && + (BBlockTransferSrcVectorDim == 2 || BBlockTransferSrcVectorDim == 3), + "wrong!"); + + // vector memory access of A: could be on M or AK1 dimension + if constexpr(ABlockTransferSrcVectorDim == 2) + { + if(!(arg.a_mz_stride_ == 1 && + arg.a_grid_desc_akb_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == + 0)) + { + return false; + } + } + else + { + if(!(arg.a_kz_stride_ == 1 && + arg.a_grid_desc_akb_ak0_m_ak1_.GetLength(I3) % ABlockTransferSrcScalarPerVector == + 0)) + { + return false; + } + } + + // vector memory access of B: could be on N or BK1 dimension + if constexpr(BBlockTransferSrcVectorDim == 2) + { + if(!(arg.b_nz_stride_ == 1 && + arg.b_grid_desc_bkb_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == + 0)) + { + return false; + } + } + else + { + if(!(arg.b_kz_stride_ == 1 && + arg.b_grid_desc_bkb_bk0_n_bk1_.GetLength(I3) % BBlockTransferSrcScalarPerVector == + 0)) + { + return false; + } + } + + // vector memory access of Ds: always on NPerBlock dimension + bool valid_d_access = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + if(!(arg.ds_nz_stride_[i] == 1 && + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0)) + { + valid_d_access = false; + } + }); + + if(valid_d_access == false) + { + return false; + } + + // vector memory access of E: always on NPerBlock dimension + if(!((arg.e_nz_stride_ == 1 && + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0) || + CDEBlockTransferScalarPerVector_NPerBlock == 1)) + { + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto + 
MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + index_t split_k) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_gs_ms_ns_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + ds_gs_ms_ns_lengths, + ds_gs_ms_ns_strides, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op, + split_k}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + index_t split_k) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_gs_ms_ns_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + ds_gs_ms_ns_lengths, + ds_gs_ms_ns_strides, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op, + split_k); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceSplitKContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimG << ", " + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp new file mode 100644 index 00000000..fc913e9b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
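// Illustrative note: the Invoker above selects the AtomicAdd gridwise GEMM
// and zeroes E with hipMemset whenever split_k_ > 1, so each K slice adds its
// partial product into the output.  The host-side sketch below reproduces
// that accumulation order for a plain row-major GEMM; it is an illustration
// of the split-K scheme, not the device code path.
#include <algorithm>
#include <cstddef>
#include <vector>

inline void reference_splitk_gemm(const std::vector<float>& A, // [M, K] row-major
                                  const std::vector<float>& B, // [N, K] row-major
                                  std::vector<float>&       E, // [M, N] row-major
                                  std::size_t M,
                                  std::size_t N,
                                  std::size_t K,
                                  std::size_t split_k)
{
    // matches the hipMemset issued before the AtomicAdd launches
    std::fill(E.begin(), E.end(), 0.f);

    const std::size_t k_per_slice = (K + split_k - 1) / split_k;

    for(std::size_t slice = 0; slice < split_k; ++slice) // each slice is one kernel pass
    {
        const std::size_t k_begin = slice * k_per_slice;
        const std::size_t k_end   = std::min(K, k_begin + k_per_slice);

        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t n = 0; n < N; ++n)
            {
                float partial = 0.f;
                for(std::size_t k = k_begin; k < k_end; ++k)
                    partial += A[m * K + k] * B[n * K + k];

                E[m * N + n] += partial; // performed with atomic adds on the device
            }
    }
}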
+ +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { + +enum struct GemmSpecialization +{ + // Gemm + Default, + MPadding, + NPadding, + KPadding, + MNPadding, + MKPadding, + NKPadding, + MNKPadding, + // Gemm + Gemm + OPadding, + MOPadding, + NOPadding, + KOPadding, + MNOPadding, + MKOPadding, + NKOPadding, + MNKOPadding, +}; + +inline std::string getGemmSpecializationString(const GemmSpecialization& s) +{ + switch(s) + { + case GemmSpecialization::Default: return "Default"; + case GemmSpecialization::MPadding: return "MPadding"; + case GemmSpecialization::NPadding: return "NPadding"; + case GemmSpecialization::KPadding: return "KPadding"; + case GemmSpecialization::MNPadding: return "MNPadding"; + case GemmSpecialization::MKPadding: return "MKPadding"; + case GemmSpecialization::NKPadding: return "NKPadding"; + case GemmSpecialization::MNKPadding: return "MNKPadding"; + case GemmSpecialization::OPadding: return "OPadding"; + case GemmSpecialization::MOPadding: return "MOPadding"; + case GemmSpecialization::NOPadding: return "NOPadding"; + case GemmSpecialization::KOPadding: return "KOPadding"; + case GemmSpecialization::MNOPadding: return "MNOPadding"; + case GemmSpecialization::MKOPadding: return "MKOPadding"; + case GemmSpecialization::NKOPadding: return "NKOPadding"; + case GemmSpecialization::MNKOPadding: return "MNKOPadding"; + default: return "Unrecognized specialization!"; + } +} + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..2237ad94 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,1040 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
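// Illustrative note: the enum above simply names which GEMM dimensions are
// padded up to their per-block tile sizes.  The helper below sketches one
// plausible way a host program could pick a specialization (pad a dimension
// only when it is not a multiple of its tile); the selection logic and the
// "MyGemmSpec" stand-in are assumptions for illustration, not a CK API.
enum struct MyGemmSpec
{
    Default,
    MPadding,
    NPadding,
    KPadding,
    MNPadding,
    MKPadding,
    NKPadding,
    MNKPadding
};

inline MyGemmSpec
choose_gemm_spec(int M, int N, int K, int MPerBlock, int NPerBlock, int KPerBlock)
{
    const bool pad_m = (M % MPerBlock != 0);
    const bool pad_n = (N % NPerBlock != 0);
    const bool pad_k = (K % KPerBlock != 0);

    if(pad_m && pad_n && pad_k) return MyGemmSpec::MNKPadding;
    if(pad_m && pad_n) return MyGemmSpec::MNPadding;
    if(pad_m && pad_k) return MyGemmSpec::MKPadding;
    if(pad_n && pad_k) return MyGemmSpec::NKPadding;
    if(pad_m) return MyGemmSpec::MPadding;
    if(pad_n) return MyGemmSpec::NPadding;
    if(pad_k) return MyGemmSpec::KPadding;
    return MyGemmSpec::Default;
}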
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_contraction_multiple_d_xdl_cshuffle( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatDsPointer p_ds_grid, + FloatE* __restrict__ p_e_grid, + const index_t batch_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + FloatDsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_etile_map; + ignore = 
compute_ptr_offset_of_batch; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] +// B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] +// D[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] +// E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2, ...] + +// NOTE: TensorSpecialization::Packed specialized tensor is "packed" in a sense that each inner +// dimension in a dimension group (eg [G0, G1] in Gs, [M0, M1, M2] in Ms, etc.) are contiguous and +// ordered. Not in a sense that the tensor [G0, G1, ..., M0, M1, ..., N0, N1...] can be permuted +// while still being a contiguous, unpadded tensor. In other words, it merely degenerates into +// TensorSpecialization::Default with NumDimG/M/N/K = 1 +// +// Detail- Packed tensor satisfies +// stride_0 = 1 +// stride_i = stride_{i - 1} * extent_{i - 1} +// So tensor +// [G0, G1, G2, M, N] +// transposed into tensor +// [G0, G2, G1, M, N] +// with strides +// [G2 * G1 * M * N, G1 * M * N, M * N, N, 1] +// is again a packed tensor. MakeGridDescriptor() currently just merges dimensions and ignores some +// strides from input tensor extents so finer dimension information is lost. Merging dimensions is +// essentially a degenerated case of TensorSpecialization::Default with NumDimG/M/N/K = 1. +// +// Might need to expose dimension order to the interface to fully support +// TensorSpecialization::Packed in a traditional sense of "packed" tensor +template +struct DeviceBatchedContractionMultipleD_Xdl_CShuffle + : public DeviceBatchedContractionMultipleD +{ + using DeviceOp = DeviceBatchedContractionMultipleD_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + // Assume: A[G0, G1, ..., M0, M1, M2, ..., K0, K1, K2, ...] + static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + assert(a_gs_ms_ks_lengths_vec.size() == NumDimG + NumDimM + NumDimK && + a_gs_ms_ks_strides_vec.size() == NumDimG + NumDimM + NumDimK); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto a_ms_ks_lengths = to_tuple( + a_gs_ms_ks_lengths_vec, Number{}, Number{}); + const auto a_ms_ks_strides = to_tuple( + a_gs_ms_ks_strides_vec, Number{}, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds); + + // lengths for K0, K1, ... 
+ const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); + + if constexpr(ASpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor( + make_tuple(M, K), + make_tuple(a_ms_ks_strides[Number{}], + a_ms_ks_strides[Number{}])); + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + else + { + // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] + const auto a_grid_desc_ms_ks = + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); + + // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] + const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( + a_grid_desc_ms_ks, + make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), + make_tuple(mDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + } + + // Assume: B[G0, G1, ..., N0, N1, N2, ..., K0, K1, K2, ...] + static auto MakeBGridDescriptor_N_K(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) + { + assert(b_gs_ns_ks_lengths_vec.size() == NumDimG + NumDimN + NumDimK && + b_gs_ns_ks_strides_vec.size() == NumDimG + NumDimN + NumDimK); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto b_ns_ks_lengths = to_tuple( + b_gs_ns_ks_lengths_vec, Number{}, Number{}); + const auto b_ns_ks_strides = to_tuple( + b_gs_ns_ks_strides_vec, Number{}, Number{}); + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen<0, NumDimN, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(b_ns_ks_lengths, kDimIds); + + // lengths for N0, N1, ... + const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); + + if constexpr(BSpec == TensorSpecialization::Packed) + { + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor( + make_tuple(N, K), + make_tuple(b_ns_ks_strides[Number{}], + b_ns_ks_strides[Number{}])); + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + else + { + // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] + const auto b_grid_desc_ns_ks = + make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); + + // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] + const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( + b_grid_desc_ns_ks, + make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), + make_tuple(nDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + } + + // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] 
+ static auto MakeEGridDescriptor_M_N(const std::vector& e_gs_ms_ns_lengths_vec, + const std::vector& e_gs_ms_ns_strides_vec) + { + assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto e_ms_ns_lengths = to_tuple( + e_gs_ms_ns_lengths_vec, Number{}, Number{}); + const auto e_ms_ns_strides = to_tuple( + e_gs_ms_ns_strides_vec, Number{}, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto nLengths = get_container_subset(e_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(M, N), + make_tuple(e_ms_ns_strides[Number{}], + e_ms_ns_strides[Number{}])); + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + else + { + // naive tensor E[M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_ms_ns = + make_naive_tensor_descriptor(e_ms_ns_lengths, e_ms_ns_strides); + + // transformed tensor E[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...] + const auto e_grid_desc_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + } + + // assume E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + static auto MakeEGridDescriptor_G_M_N(const std::vector& e_gs_ms_ns_lengths_vec, + const std::vector& e_gs_ms_ns_strides_vec) + { + assert(e_gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + e_gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto e_gs_ms_ns_lengths = + to_tuple(e_gs_ms_ns_lengths_vec, Number<0>{}, Number{}); + const auto e_gs_ms_ns_strides = + to_tuple(e_gs_ms_ns_strides_vec, Number<0>{}, Number{}); + + // dimension Ids for G0, G1, ... + constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{}; + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = + typename arithmetic_sequence_gen::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen::type{}; + + // lengths for G0, G1, ... + const auto gLengths = get_container_subset(e_gs_ms_ns_lengths, gDimIds); + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_gs_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... 
+ const auto nLengths = get_container_subset(e_gs_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto G = container_reduce(gLengths, math::multiplies{}, Number<1>{}); + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_g_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(G, M, N), + make_tuple(e_gs_ms_ns_strides[Number{}], + e_gs_ms_ns_strides[Number{}], + e_gs_ms_ns_strides[Number{}])); + // return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw); + return e_grid_desc_g_mraw_nraw; + } + else + { + // naive tensor E[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_gs_ms_ns = + make_naive_tensor_descriptor(e_gs_ms_ns_lengths, e_gs_ms_ns_strides); + + // transformed tensor E[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * + // N2 * ...] + const auto e_grid_desc_g_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_gs_ms_ns, + make_tuple(make_merge_transform(gLengths), + make_merge_transform(mLengths), + make_merge_transform(nLengths)), + make_tuple(gDimIds, mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // return matrix_padder.PadCDescriptor_M_N(e_grid_desc_g_mraw_nraw); + return e_grid_desc_g_mraw_nraw; + } + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_gs_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_gs_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths_vec[i], + ds_gs_ms_ns_strides_vec[i]); + }, + Number{}); + } + + static auto MakeDsGridDescriptor_G_M_N( + const std::array, NumDTensor>& ds_gs_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_gs_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_G_M_N(ds_gs_ms_ns_lengths_vec[i], + ds_gs_ms_ns_strides_vec[i]); + }, + Number{}); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + + using DsGridDesc_G_M_N = remove_cvref_t; + using EGridDesc_G_M_N = decltype(MakeEGridDescriptor_G_M_N({}, {})); + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t batch_stride_A, + index_t batch_stride_B, + DsGridDesc_G_M_N ds_grid_desc_g_m_n, + EGridDesc_G_M_N e_grid_desc_g_m_n) + : batch_stride_A_(batch_stride_A), + batch_stride_B_(batch_stride_B), + ds_grid_desc_g_m_n_(ds_grid_desc_g_m_n), + e_grid_desc_g_m_n_(e_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * batch_stride_A_; + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * batch_stride_B_; + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + std::array ds_offset; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_offset[i] = static_cast(g_idx) * + ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(1, 0, 0)); + }); + + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return static_cast(g_idx) * + e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(1, 0, 0)); + } + + private: + index_t batch_stride_A_; + 
index_t batch_stride_B_; + DsGridDesc_G_M_N ds_grid_desc_g_m_n_; + EGridDesc_G_M_N e_grid_desc_g_m_n_; + }; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_m_k_{ + DeviceOp::MakeAGridDescriptor_M_K(a_gs_ms_ns_lengths, a_gs_ms_ks_strides)}, + b_grid_desc_n_k_{ + DeviceOp::MakeBGridDescriptor_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{ + DeviceOp::MakeEGridDescriptor_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)}, + ds_grid_desc_g_m_n_{ + DeviceOp::MakeDsGridDescriptor_G_M_N(ds_gs_ms_ns_lengths, ds_gs_ms_ns_strides)}, + e_grid_desc_g_m_n_{ + DeviceOp::MakeEGridDescriptor_G_M_N(e_gs_ms_ns_lengths, e_gs_ms_ns_strides)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_mz_stride_{}, + a_kz_stride_{}, + b_nz_stride_{}, + b_kz_stride_{}, + ds_nz_stride_{}, + e_nz_stride_{}, + a_batch_stride_{a_gs_ms_ks_strides[NumDimG - 1]}, + 
b_batch_stride_{b_gs_ns_ks_strides[NumDimG - 1]}, + compute_ptr_offset_of_batch_{ + a_batch_stride_, b_batch_stride_, ds_grid_desc_g_m_n_, e_grid_desc_g_m_n_} + { + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0, ""); + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N(ds_gs_ms_ns_lengths[i], + ds_gs_ms_ns_strides[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + } + + // for sanity check of vector memory access + a_mz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM - 1]; + a_kz_stride_ = a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]; + b_nz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN - 1]; + b_kz_stride_ = b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]; + + for(index_t i = 0; i < NumDTensor; ++i) + { + ds_nz_stride_[i] = ds_gs_ms_ns_strides[i][NumDimG + NumDimM + NumDimN - 1]; + } + + e_nz_stride_ = e_gs_ms_ns_strides[NumDimG + NumDimM + NumDimN - 1]; + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + DsGridDesc_G_M_N ds_grid_desc_g_m_n_; + EGridDesc_G_M_N e_grid_desc_g_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // Strides for the last M/N/K dimensions of A/B/Ds/E + // for sanity check of vector load/store + index_t a_mz_stride_; + index_t a_kz_stride_; + index_t b_nz_stride_; + index_t b_kz_stride_; + std::array ds_nz_stride_; + index_t e_mz_stride_; + index_t e_nz_stride_; + + index_t a_batch_stride_; + index_t b_batch_stride_; + + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + 
arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting"); + } + + const index_t G = arg.e_grid_desc_g_m_n_.GetLength(I0); + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * G; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_contraction_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + ComputePtrOffsetOfStridedBatch, + DeviceOp::Block2ETileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + G, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + return false; + } + + // check vector access + static_assert((ABlockTransferSrcVectorDim == 1 || ABlockTransferSrcVectorDim == 2) && + (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), + "wrong!"); + + // vector memory access of A: could be on M or AK1 dimension + if constexpr(ABlockTransferSrcVectorDim == 1) + { + if(!(arg.a_mz_stride_ == 1 && + arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(arg.a_kz_stride_ == 1 && + arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of B: could be on N or BK1 dimension + if constexpr(BBlockTransferSrcVectorDim == 1) + { + if(!(arg.b_nz_stride_ == 1 && + arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(arg.b_kz_stride_ == 1 && + arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of Ds: always on NPerBlock dimension + bool valid_d_access = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + if(!(arg.ds_nz_stride_[i] == 1 && + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % + 
CDEBlockTransferScalarPerVector_NPerBlock == + 0)) + { + valid_d_access = false; + } + }); + + if(valid_d_access == false) + { + return false; + } + + // vector memory access of E: always on NPerBlock dimension + if(!((arg.e_nz_stride_ == 1 && + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0) || + CDEBlockTransferScalarPerVector_NPerBlock == 1)) + { + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto + MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_gs_ms_ns_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + ds_gs_ms_ns_lengths, + ds_gs_ms_ns_strides, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_gs_ms_ns_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::array, NumDTensor>& ds_gs_ms_ns_lengths, + const std::array, NumDTensor>& ds_gs_ms_ns_strides, + const std::vector& e_gs_ms_ns_lengths, + const std::vector& e_gs_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_gs_ms_ns_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + ds_gs_ms_ns_lengths, + ds_gs_ms_ns_strides, + e_gs_ms_ns_lengths, + e_gs_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimG << ", " + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp new file mode 100644 index 00000000..01f5e17d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_e_permute_xdl.hpp @@ -0,0 +1,683 @@ +#pragma once + +#include +#include + +#include 
"ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_e_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for +\link + * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of + * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemmCPermute and GroupedGemm (and the corresponding GEMM fusion). 
+ * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_e_permute_xdl(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + EDataType* __restrict__ p_e_grid, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx))); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + ck::Tuple<>{}, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ck::Tuple<>{}, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_etile_map; +#endif +} + +template +struct DeviceBatchedGemmEPermuteXdl : public DeviceBatchedGemmEPermute +{ + using DeviceOp = DeviceBatchedGemmEPermuteXdl; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + 
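+    // Worked example (illustrative numbers; MPerBlock = 256 and an M-padding GemmSpec are assumed
+    // here, not fixed by this header): for row-major A with MRaw = 1000, KRaw = 64, StrideA = 64,
+    // MakeAGridDescriptor_M_K() above first builds a naive descriptor with lengths (1000, 64) and
+    // strides (64, 1); matrix_padder then extends the M length to 1024 (= ceil(1000 / 256) * 256),
+    // so every block tile covers a full 256 rows. The extra 24 rows are logical pad introduced by
+    // the descriptor transform, not real memory.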
static auto + MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t stride_M, index_t stride_N) + { + const auto e_grid_desc_mraw_nraw = + make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), make_tuple(stride_M, stride_N)); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + static auto MakeEGridDescriptor_G0_G1_M_N(index_t G0, + index_t G1, + index_t MRaw, + index_t NRaw, + index_t stride_G0, + index_t stride_G1, + index_t stride_M, + index_t stride_N) + { + const auto e_grid_desc_g0_g1_mraw_nraw = [&]() { + return make_naive_tensor_descriptor( + make_tuple(G0, G1, MRaw, NRaw), + make_tuple(stride_G0, stride_G1, stride_M, stride_N)); + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor( + e_grid_desc_g0_g1_mraw_nraw, + make_tuple(make_pass_through_transform(G0), + make_pass_through_transform(G1), + make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + e_grid_desc_g0_g1_mraw_nraw, + make_tuple(make_pass_through_transform(G0), + make_pass_through_transform(G1), + make_right_pad_transform(MRaw, MPad), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + e_grid_desc_g0_g1_mraw_nraw, + make_tuple(make_pass_through_transform(G0), + make_pass_through_transform(G1), + make_pass_through_transform(MRaw), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + } + else + { + // not pad M or N + return e_grid_desc_g0_g1_mraw_nraw; + } + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1, 1)); + using EGridDesc_G0_G1_M_N = decltype(MakeEGridDescriptor_G0_G1_M_N(1, 1, 1, 1, 1, 1, 1, 1)); + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t Batchstride_A, + index_t Batchstride_B, + EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n) + : Batchstride_A_(Batchstride_A), + Batchstride_B_(Batchstride_B), + e_grid_desc_g0_g1_m_n_(e_grid_desc_g0_g1_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(Batchstride_A_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(Batchstride_B_); + } + + __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const + { + const index_t G1 = e_grid_desc_g0_g1_m_n_.GetLength(I1); + index_t b0 = g_idx / 
G1; + index_t b1 = g_idx - b0 * G1; // g_idx % G1 + return e_grid_desc_g0_g1_m_n_.CalculateOffset(make_multi_index(b0, b1, 0, 0)); + } + + private: + index_t Batchstride_A_; + index_t Batchstride_B_; + EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n_; + }; + + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + ck::Tuple<>, // DsDataType, + EDataType, // EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_M_K, + BGridDesc_N_K, + Tuple<>, + EGridDesc_M_N, + NumPrefetch, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype( + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{})); + using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + EDataType* p_e_grid, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_e_grid_{p_e_grid}, + BatchCount_(BatchCount), + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(M, K, stride_A)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(K, N, stride_B)}, + e_grid_desc_m_n_{ + DeviceOp::MakeEGridDescriptor_M_N(batched_gemm_e_permute_desc.M_, + batched_gemm_e_permute_desc.N_, + batched_gemm_e_permute_desc.stride_M_, + batched_gemm_e_permute_desc.stride_N_)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + e_grid_desc_mblock_mperblock_nblock_nperblock{}, + e_grid_desc_g0_g1_m_n_{ + DeviceOp::MakeEGridDescriptor_G0_G1_M_N(batched_gemm_e_permute_desc.G0_, + batched_gemm_e_permute_desc.G1_, + batched_gemm_e_permute_desc.M_, + batched_gemm_e_permute_desc.N_, + batched_gemm_e_permute_desc.stride_G0_, + batched_gemm_e_permute_desc.stride_G1_, + batched_gemm_e_permute_desc.stride_M_, + batched_gemm_e_permute_desc.stride_N_)}, + compute_ptr_offset_of_batch_{batch_stride_A, batch_stride_B, e_grid_desc_g0_g1_m_n_}, + 
block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ck::Tuple<>{}, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + std::cout << "C[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + EDataType* p_e_grid_; + + // batch count + index_t BatchCount_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock; + EGridDesc_G0_G1_M_N e_grid_desc_g0_g1_m_n_; + + // for calculating Batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + ck::Tuple<>{}, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseBatchedGemmCPermute_km_kn_m0m1n0n1_xdlops_v2r3 has invalid " + "setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.BatchCount_; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_batched_gemm_e_permute_xdl< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + EDataType, + remove_reference_t, + remove_reference_t, + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + remove_reference_t, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_e_grid_, + arg.BatchCount_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + ck::Tuple<>{}, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + EDataType* p_e, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_e, + M, + N, + K, + stride_A, + stride_B, + batch_stride_A, + batch_stride_B, + batched_gemm_e_permute_desc, + BatchCount, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t stride_A, + index_t stride_B, + index_t batch_stride_A, + index_t batch_stride_B, + BatchedGemmEPermuteDesc batched_gemm_e_permute_desc, + index_t BatchCount, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_e), + M, + N, + K, + stride_A, + stride_B, + batch_stride_A, + batch_stride_B, + batched_gemm_e_permute_desc, + BatchCount, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic 
+ std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmEPermuteXdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp new file mode 100644 index 00000000..3b87e563 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp @@ -0,0 +1,747 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_gemm_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const AccElementwiseOperation acc_element_op, + const B1ElementwiseOperation b1_element_op, + const CElementwiseOperation c_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap block_2_ctile_map, + const index_t batch_count, + const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetBBasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid 
+ c_batch_offset, + p_shared, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b1_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = acc_element_op; + ignore = b1_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = b1_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_ctile_map; + ignore = batch_count; + ignore = compute_base_ptr_of_batch; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// Computes C = A * B0 * B1 +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm +{ + using DeviceOp = DeviceBatchedGemmGemm_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto matrix_padder = + GemmGemmPadder{ + MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / AK1; + + return transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / BK1; + + return transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // Args: Gemm1KRaw, Gemm1NRaw, StrideB1 + static auto MakeB1GridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b1_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto b1_grid_desc_n_k = matrix_padder.PadB1Descriptor_N_K(b1_grid_desc_nraw_kraw); + + const 
auto N = b1_grid_desc_n_k.GetLength(I0); + const auto K = b1_grid_desc_n_k.GetLength(I1); + + const auto B1K0 = K / B1K1; + + return transform_tensor_descriptor( + b1_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw); + } + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideB1_(BatchStrideB1), + BatchStrideC_(BatchStrideC) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB1_); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideC_); + } + + private: + index_t BatchStrideA_; + index_t BatchStrideB_; + index_t BatchStrideB1_; + index_t BatchStrideC_; + }; + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseBatchedGemmGemm_Xdl_CShuffle< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + B1GridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + AK1, + BK1, + B1K1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + Gemm1NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + true, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + true, + BBlockLdsExtraN, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + false, + B1BlockLdsExtraN, + 
CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, // = ORaw + index_t Batch, + index_t StrideA, + index_t StrideB, + index_t StrideB1, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + b1_grid_desc_bk0_n_bk1_{ + DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, Gemm1NRaw, StrideC)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op}, + batch_count_(Batch), + compute_base_ptr_of_batch_{BatchStrideA, BatchStrideB, BatchStrideB1, BatchStrideC}, + raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + b1_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[AK0, M, AK1]: " << a_grid_desc_ak0_m_ak1_ << std::endl; + std::cout << "B0[BK0, N, BK1]: " << b_grid_desc_bk0_n_bk1_ << std::endl; + std::cout << "B1[BK0, N, BK1]: " << b1_grid_desc_bk0_n_bk1_ << std::endl; + std::cout << "C[M, N]: " << c_grid_desc_m_n_ << std::endl; + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + index_t batch_count_; + ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + + // For robust IsSupportedArgument() check + std::vector raw_lengths_m_n_k_o_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!DeviceOp::IsSupportedArgument(arg)) + { + throw std::runtime_error("wrong! 
unsupported argument"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; + + // Gemm0_K + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_gemm_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::B1GridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + ComputeBasePtrOfStridedBatch, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_b1_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.b1_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_, + arg.batch_count_, + arg.compute_base_ptr_of_batch_); + }; + + // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need + // to concern Gemm0's loop + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + const auto MRaw = arg.raw_lengths_m_n_k_o_[0]; + const auto NRaw = arg.raw_lengths_m_n_k_o_[1]; + const auto KRaw = arg.raw_lengths_m_n_k_o_[2]; + const auto Gemm1NRaw = arg.raw_lengths_m_n_k_o_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = + is_same_v ? KRaw : MRaw; + const auto b_extent_lowest = + is_same_v ? NRaw : KRaw; + const auto b1_extent_lowest = + is_same_v ? Gemm1NRaw : NRaw; + const auto c_extent_lowest = + is_same_v ? 
Gemm1NRaw : MRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + const B1DataType* p_b1, + CDataType* p_c, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, + index_t Batch, + index_t StrideA, + index_t StrideB, + index_t StrideB1, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, p_b, p_b1, p_c, MRaw, + NRaw, KRaw, Gemm1NRaw, Batch, StrideA, + StrideB, StrideB1, StrideC, BatchStrideA, BatchStrideB, + BatchStrideB1, BatchStrideC, a_element_op, b_element_op, acc_element_op, + b1_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_b1, + void* p_c, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, + index_t Batch, + index_t StrideA, + index_t StrideB, + index_t StrideB1, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_b1), + static_cast(p_c), + MRaw, + NRaw, + KRaw, + Gemm1NRaw, + Batch, + StrideA, + StrideB, + StrideB1, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideB1, + BatchStrideC, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmGemm_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << Gemm1NPerBlock << ", " + << Gemm1KPerBlock << ", " + << B1K1 << ", " + << getGemmSpecializationString(GemmSpec) << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp new file mode 100644 index 00000000..e34c19bd --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multi_d_xdl.hpp @@ -0,0 +1,716 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_multi_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for + * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the + * computing of pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). 
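+ *
+ * \note Minimal sketch of how the per-batch base pointers are derived for evenly strided
+ * batches (illustrative only; the actual kernel below additionally wraps these values in
+ * __builtin_amdgcn_readfirstlane and handles the Ds pointers via static_for):
+ * \code
+ * // one offset object per problem; strides are counted in elements between consecutive batches
+ * ComputePtrOffsetOfStridedBatch compute_ptr_offset{BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE};
+ *
+ * // inside the kernel, workgroups are divided evenly among the batches
+ * const index_t num_blocks_per_batch = get_grid_size() / batch_count;
+ * const index_t g_idx                = get_block_1d_id() / num_blocks_per_batch;
+ *
+ * // per-batch pointers are the grid pointers shifted by the batch offset
+ * const ABDataType* p_a_batch = p_a_grid + compute_ptr_offset.GetAPtrOffset(g_idx); // g_idx * BatchStrideA
+ * const ABDataType* p_b_batch = p_b_grid + compute_ptr_offset.GetBPtrOffset(g_idx); // g_idx * BatchStrideB
+ * EDataType*        p_e_batch = p_e_grid + compute_ptr_offset.GetEPtrOffset(g_idx); // g_idx * BatchStrideE
+ * \endcode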
+ * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_xdl(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const index_t batch_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2ETileMap block_2_etile_map) +{ + +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_etile_map; +#endif +} + +template +struct DeviceBatchedGemmMultiD_Xdl : public DeviceBatchedGemmMultiD +{ + using DeviceOp = DeviceBatchedGemmMultiD_Xdl; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + 
return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + template + static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE) + { + const auto e_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideE, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideE)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + static auto MakeDsGridDescriptor_M_N(const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(MRaws[i], NRaws[i], DsStride[i]); + }, + Number{}); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + std::array BatchStrideDs, + index_t BatchStrideE) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideE_(BatchStrideE) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + std::array ds_offset; + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_offset[i] = g_idx * static_cast(BatchStrideDs_[i]); + }); + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE_); + } + + private: + index_t BatchStrideA_; + index_t BatchStrideB_; + std::array BatchStrideDs_; + index_t BatchStrideE_; + }; + + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + 
BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Batch, + index_t StrideA, + index_t StrideB, + const std::array& StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + Batch_(Batch), + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideE)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideDs, BatchStrideE}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + // populate pointer, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = + DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideDs[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // Batch + index_t Batch_; + + // tensor descriptors for problem 
definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // for calculating batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceBatchedGemmMultiD_Xdl::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.Batch_; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = + kernel_batched_gemm_xdl; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.Batch_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t Batch, + index_t StrideA, + index_t StrideB, + const std::array& StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + M, + N, + K, + Batch, + StrideA, + StrideB, + StrideDs, + StrideE, + BatchStrideA, + 
BatchStrideB, + BatchStrideDs, + BatchStrideE, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + index_t M, + index_t N, + index_t K, + index_t Batch, + index_t StrideA, + index_t StrideB, + const std::array& StrideDs, + index_t StrideE, + index_t BatchStrideA, + index_t BatchStrideB, + const std::array& BatchStrideDs, + index_t BatchStrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + M, + N, + K, + Batch, + StrideA, + StrideB, + StrideDs, + StrideE, + BatchStrideA, + BatchStrideB, + BatchStrideDs, + BatchStrideE, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmMultiD_Xdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << getGemmSpecializationString(GemmSpec) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..19e2649e --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,951 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_gemm_xdl_cshuffle_v1( + const A0B0B1DataType* __restrict__ p_a0_grid, + const A0B0B1DataType* __restrict__ p_b0_grid, + D0sPointer p_d0s_grid, + const A0B0B1DataType* __restrict__ p_b1_grid, + D1sPointer p_d1s_grid, + E1DataType* __restrict__ p_e1_grid, + const A0ElementwiseOperation a0_element_op, + const B0ElementwiseOperation b0_element_op, + const CDE0ElementwiseOperation cde0_element_op, + const B1ElementwiseOperation b1_element_op, + const CDE1ElementwiseOperation cde1_element_op, + const A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1, + const B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1, + const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5 + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5, + const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1, + const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + d1s_grid_desc_mblock_mperblock_nblock_nperblock, + const E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e1_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2E1TileMap block_2_e1tile_map, + const index_t batch_count, + const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetBBasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + static_for<0, p_d0s_grid.Size(), 1>{}([&](auto In) { + const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetD0BasePtr(g_idx, In))); + p_d0s_grid(In) = p_d0s_grid(In) + d0_batch_offset; + }); + + static_for<0, p_d1s_grid.Size(), 1>{}([&](auto In) { + const long_index_t d1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetD1BasePtr(g_idx, In))); + p_d1s_grid(In) = p_d1s_grid(In) + d1_batch_offset; + }); + + GridwiseGemm::template Run(p_a0_grid + a_batch_offset, + p_b0_grid + b_batch_offset, + p_d0s_grid, + p_b1_grid + b1_batch_offset, + p_d1s_grid, + p_e1_grid + c_batch_offset, + p_shared, + a0_element_op, 
+ b0_element_op, + cde0_element_op, + b1_element_op, + cde1_element_op, + a0_grid_desc_ak0_m_ak1, + b0_grid_desc_bk0_n_bk1, + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5, + b1_grid_desc_bk0_n_bk1, + d1s_grid_desc_mblock_mperblock_nblock_nperblock, + e1_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_e1tile_map); +#else + ignore = p_a0_grid; + ignore = p_b0_grid; + ignore = p_d0s_grid; + ignore = p_b1_grid; + ignore = p_d1s_grid; + ignore = p_e1_grid; + ignore = a0_element_op; + ignore = b0_element_op; + ignore = cde0_element_op; + ignore = b1_element_op; + ignore = cde1_element_op; + ignore = a0_grid_desc_ak0_m_ak1; + ignore = b0_grid_desc_bk0_n_bk1; + ignore = d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5; + ignore = b1_grid_desc_bk0_n_bk1; + ignore = d1s_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e1_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_e1tile_map; + ignore = batch_count; + ignore = compute_base_ptr_of_batch; +#endif +} + +// Computes C = A * B0 * B1 +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle + : public DeviceBatchedGemmMultipleDGemmMultipleD +{ + using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle; + + static constexpr index_t NumD0Tensor = D0sDataType::Size(); + static constexpr index_t NumD1Tensor = D1sDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + static constexpr auto I8 = Number<8>{}; + static constexpr auto I9 = Number<9>{}; + + static constexpr auto gemm0_padder = + GemmPadder_v2{ + Gemm0MPerBlock, Gemm0NPerBlock, Gemm0KPerBlock}; + + static constexpr auto gemm1_padder = + GemmPadder_v2{ + Gemm0MPerBlock, Gemm1NPerBlock, Gemm1KPerBlock}; + + // for Gemm0 + static auto MakeA0GridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA0) + { + const auto a0_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA0, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA0)); + } + }(); + + return gemm0_padder.PadADescriptor_M_K(a0_grid_desc_mraw_kraw); + } + + // for Gemm0 + static auto MakeB0GridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b0_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return gemm0_padder.PadBDescriptor_N_K(b0_grid_desc_nraw_kraw); + } + + // for Gemm0 + template + static auto MakeD0GridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideD0) + { + const auto d0_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideD0, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideD0)); + } + }(); + + return gemm0_padder.PadCDescriptor_M_N(d0_grid_desc_mraw_nraw); + } + + // for Gemm1 + static auto 
MakeB1GridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b1_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return gemm1_padder.PadBDescriptor_N_K(b1_grid_desc_nraw_kraw); + } + + // for Gemm1 + template + static auto MakeE1GridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE1) + { + const auto e1_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideE1, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideE1)); + } + }(); + + return gemm1_padder.PadCDescriptor_M_N(e1_grid_desc_mraw_nraw); + } + + static auto MakeD0sGridDescriptor_M_N(const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeD0GridDescriptor_M_N(MRaws[i], NRaws[i], DsStride[i]); + }, + Number{}); + } + + static auto MakeD1sGridDescriptor_M_N(const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeE1GridDescriptor_M_N(MRaws[i], NRaws[i], DsStride[i]); + }, + Number{}); + } + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(index_t BatchStrideA0, + index_t BatchStrideB0, + std::array BatchStrideD0s, + index_t BatchStrideB1, + std::array BatchStrideD1s, + index_t BatchStrideE1) + : BatchStrideA0_(BatchStrideA0), + BatchStrideB0_(BatchStrideB0), + BatchStrideD0s_(BatchStrideD0s), + BatchStrideB1_(BatchStrideB1), + BatchStrideD1s_(BatchStrideD1s), + BatchStrideE1_(BatchStrideE1) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA0_); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB0_); + } + + template + __host__ __device__ constexpr long_index_t GetD0BasePtr(index_t g_idx, + Number d1_idx) const + { + return g_idx * static_cast(BatchStrideD0s_[d1_idx]); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB1_); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE1_); + } + + template + __host__ __device__ constexpr auto GetD1BasePtr(index_t g_idx, Number d1_idx) const + { + return g_idx * static_cast(BatchStrideD1s_[d1_idx]); + } + + private: + index_t BatchStrideA0_; + index_t BatchStrideB0_; + std::array BatchStrideD0s_; + index_t BatchStrideB1_; + std::array BatchStrideD1s_; + index_t BatchStrideE1_; + }; + + using A0GridDesc_M_K = decltype(MakeA0GridDescriptor_M_K(1, 1, 1)); + using B0GridDesc_N_K = decltype(MakeB0GridDescriptor_N_K(1, 1, 1)); + using D0sGridDesc_M_N = remove_cvref_t; + using B1GridDesc_N_K = decltype(MakeB1GridDescriptor_N_K(1, 1, 1)); + using D1sGridDesc_M_N = remove_cvref_t; + using E1GridDesc_M_N = decltype(MakeE1GridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< + A0DataType, 
// TODO: distinguish A/B datatype + Acc0DataType, + D0sDataType, + Acc1DataType, + C1ShuffleDataType, + D1sDataType, + E1DataType, + A0ElementwiseOperation, + B0ElementwiseOperation, + CDE0ElementwiseOperation, + B1ElementwiseOperation, + CDE1ElementwiseOperation, + InMemoryDataOperationEnum::Set, + A0GridDesc_M_K, + B0GridDesc_N_K, + D0sGridDesc_M_N, + B1GridDesc_N_K, + D1sGridDesc_M_N, + E1GridDesc_M_N, + NumGemm0KPrefetchStage, + BlockSize, + Gemm0MPerBlock, + Gemm0NPerBlock, + Gemm0KPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + A0K1, + B0K1, + B1K1, + Gemm0MPerXdl, + Gemm0NPerXdl, + Gemm0MXdlPerWave, + Gemm0NXdlPerWave, + Gemm1NXdlPerWave, + A0BlockTransferThreadClusterLengths_AK0_M_AK1, + A0BlockTransferThreadClusterArrangeOrder, + A0BlockTransferSrcAccessOrder, + A0BlockTransferSrcVectorDim, + A0BlockTransferSrcScalarPerVector, + A0BlockTransferDstScalarPerVector_AK1, + true, + A0BlockLdsExtraM, + B0BlockTransferThreadClusterLengths_BK0_N_BK1, + B0BlockTransferThreadClusterArrangeOrder, + B0BlockTransferSrcAccessOrder, + B0BlockTransferSrcVectorDim, + B0BlockTransferSrcScalarPerVector, + B0BlockTransferDstScalarPerVector_BK1, + true, + B0BlockLdsExtraN, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + false, + B1BlockLdsExtraN, + C1ShuffleMXdlPerWavePerShuffle, + C1ShuffleGemm0NXdlPerWavePerShuffle, + CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + using A0GridDesc_AK0_M_AK1 = remove_cvref_t; + using B0GridDesc_BK0_N_BK1 = remove_cvref_t; + using B1GridDesc_BK0_N_BK1 = remove_cvref_t; + + // Argument + struct Argument : public BaseArgument + { + Argument(const A0DataType* p_a0_grid, + const B0DataType* p_b0_grid, + std::array p_d0s_grid, + const B1DataType* p_b1_grid, + std::array p_d1s_grid, + E1DataType* p_e1_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, // = ORaw + index_t Batch, + index_t StrideA0, + index_t StrideB0, + std::array StrideD0s, + index_t StrideB1, + std::array StrideD1s, + index_t StrideE1, + index_t BatchStrideA0, + index_t BatchStrideB0, + std::array BatchStrideD0s, + index_t BatchStrideB1, + std::array BatchStrideD1s, + index_t BatchStrideE1, + A0ElementwiseOperation a0_element_op, + B0ElementwiseOperation b0_element_op, + CDE0ElementwiseOperation cde0_element_op, + B1ElementwiseOperation b1_element_op, + CDE1ElementwiseOperation cde1_element_op) + : p_a0_grid_{p_a0_grid}, + p_b0_grid_{p_b0_grid}, + p_d0s_grid_{}, + p_b1_grid_{p_b1_grid}, + p_d1s_grid_{}, + p_e1_grid_{p_e1_grid}, + a0_grid_desc_m_k_{DeviceOp::MakeA0GridDescriptor_M_K(MRaw, KRaw, StrideA0)}, + b0_grid_desc_n_k_{DeviceOp::MakeB0GridDescriptor_N_K(KRaw, NRaw, StrideB0)}, + d0s_grid_desc_m_n_{}, + b1_grid_desc_n_k_{DeviceOp::MakeB1GridDescriptor_N_K(NRaw, Gemm1NRaw, StrideB1)}, + d1s_grid_desc_m_n_{}, + e1_grid_desc_m_n_{ + DeviceOp::MakeE1GridDescriptor_M_N(MRaw, Gemm1NRaw, StrideE1)}, + a0_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultA0GridDescriptor_AK0_M_AK1(a0_grid_desc_m_k_)}, + b0_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultB0GridDescriptor_BK0_N_BK1(b0_grid_desc_n_k_)}, + d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{}, + b1_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultB1GridDescriptor_BK0_N_BK1(b1_grid_desc_n_k_)}, + d1s_grid_desc_mblock_mperblock_nblock_nperblock_{}, + 
e1_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_e1tile_map_{GridwiseGemm::MakeDefaultBlock2E1TileMap(e1_grid_desc_m_n_)}, + a0_element_op_{a0_element_op}, + b0_element_op_{b0_element_op}, + cde0_element_op_{cde0_element_op}, + b1_element_op_{b1_element_op}, + cde1_element_op_{cde1_element_op}, + batch_count_(Batch), + compute_base_ptr_of_batch_{BatchStrideA0, + BatchStrideB0, + BatchStrideD0s, + BatchStrideB1, + BatchStrideD1s, + BatchStrideE1} + { + std::cout << "a0_grid_desc_m_k_{" << a0_grid_desc_m_k_.GetLength(I0) << ", " + << a0_grid_desc_m_k_.GetLength(I1) << "}" << std::endl; + std::cout << "b0_grid_desc_n_k_{" << b0_grid_desc_n_k_.GetLength(I0) << ", " + << b0_grid_desc_n_k_.GetLength(I1) << "}" << std::endl; + std::cout << "d0s_grid_desc_m_n_[I0]{" << d0s_grid_desc_m_n_[I0].GetLength(I0) << ", " + << d0s_grid_desc_m_n_[I0].GetLength(I1) << "}" << std::endl; + std::cout << "b1_grid_desc_n_k_{" << b1_grid_desc_n_k_.GetLength(I0) << ", " + << b1_grid_desc_n_k_.GetLength(I1) << "}" << std::endl; + std::cout << "d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_{" + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I0) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I1) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I2) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I3) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I4) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I5) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I6) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I7) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I8) << ", " + << d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_[I0].GetLength(I9) << "}" + << std::endl; + std::cout << "e1_grid_desc_m_n_{" << e1_grid_desc_m_n_.GetLength(I0) << ", " + << e1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + static_for<0, NumD0Tensor, 1>{}([&](auto i) { + using D0Layout = remove_cvref_t>; + using D0DataType = remove_cvref_t>; + + // D0 pointer + p_d0s_grid_(i) = static_cast(p_d0s_grid[i]); + + // D0 desc + d0s_grid_desc_m_n_(i) = + DeviceOp::MakeD0GridDescriptor_M_N(MRaw, NRaw, StrideD0s[i]); + }); + + static_for<0, NumD1Tensor, 1>{}([&](auto i) { + using D1Layout = remove_cvref_t>; + using D1DataType = remove_cvref_t>; + + // D1 pointer + p_d1s_grid_(i) = static_cast(p_d1s_grid[i]); + + // D1 desc + d1s_grid_desc_m_n_(i) = + DeviceOp::MakeE1GridDescriptor_M_N(MRaw, Gemm1NRaw, StrideD1s[i]); + }); + + if(GridwiseGemm::CheckValidity(a0_grid_desc_m_k_, + b0_grid_desc_n_k_, + b1_grid_desc_n_k_, + e1_grid_desc_m_n_, + block_2_e1tile_map_)) + { + e1_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e1_grid_desc_m_n_); + + d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_ = + GridwiseGemm::MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5( + d0s_grid_desc_m_n_); + + d1s_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + d1s_grid_desc_m_n_); + } + } + + // private: + // pointers + const A0DataType* p_a0_grid_; + const B0DataType* p_b0_grid_; + typename GridwiseGemm::D0sGridPointer p_d0s_grid_; + const B1DataType* p_b1_grid_; + typename GridwiseGemm::D1sGridPointer p_d1s_grid_; + E1DataType* p_e1_grid_; + + // tensor descriptors for problem definiton + A0GridDesc_M_K a0_grid_desc_m_k_; + 
B0GridDesc_N_K b0_grid_desc_n_k_; + D0sGridDesc_M_N d0s_grid_desc_m_n_; + B1GridDesc_N_K b1_grid_desc_n_k_; + D1sGridDesc_M_N d1s_grid_desc_m_n_; + E1GridDesc_M_N e1_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + A0GridDesc_AK0_M_AK1 a0_grid_desc_ak0_m_ak1_; + B0GridDesc_BK0_N_BK1 b0_grid_desc_bk0_n_bk1_; + typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5 + d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_; + B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + d1s_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e1_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e1-tile map + typename GridwiseGemm::DefaultBlock2E1TileMap block_2_e1tile_map_; + + // element-wise op + A0ElementwiseOperation a0_element_op_; + B0ElementwiseOperation b0_element_op_; + CDE0ElementwiseOperation cde0_element_op_; + B1ElementwiseOperation b1_element_op_; + CDE1ElementwiseOperation cde1_element_op_; + + // batch + index_t batch_count_; + ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_, + arg.b0_grid_desc_n_k_, + arg.b1_grid_desc_n_k_, + arg.e1_grid_desc_m_n_, + arg.block_2_e1tile_map_)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_e1tile_map_.CalculateGridSize(arg.e1_grid_desc_m_n_) * arg.batch_count_; + + // Gemm0_K + const auto K = arg.a0_grid_desc_m_k_.GetLength(I1); + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_batched_gemm_gemm_xdl_cshuffle_v1< + GridwiseGemm, + A0DataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::D0sGridPointer, + typename GridwiseGemm::D1sGridPointer, + E1DataType, + A0ElementwiseOperation, + B0ElementwiseOperation, + CDE0ElementwiseOperation, + B1ElementwiseOperation, + CDE1ElementwiseOperation, + DeviceOp::A0GridDesc_AK0_M_AK1, + DeviceOp::B0GridDesc_BK0_N_BK1, + typename GridwiseGemm::D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5, + DeviceOp::B1GridDesc_BK0_N_BK1, + typename GridwiseGemm::D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2E1TileMap, + ComputeBasePtrOfStridedBatch, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a0_grid_, + arg.p_b0_grid_, + arg.p_d0s_grid_, + arg.p_b1_grid_, + arg.p_d1s_grid_, + arg.p_e1_grid_, + arg.a0_element_op_, + arg.b0_element_op_, + arg.cde0_element_op_, + arg.b1_element_op_, + arg.cde1_element_op_, + arg.a0_grid_desc_ak0_m_ak1_, + arg.b0_grid_desc_bk0_n_bk1_, + arg.d0s_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.d1s_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e1_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_e1tile_map_, + arg.batch_count_, + arg.compute_base_ptr_of_batch_); + }; + + // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need + // to concern Gemm0's loop + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + 
else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a0_grid_desc_m_k_, + arg.b0_grid_desc_n_k_, + arg.b1_grid_desc_n_k_, + arg.e1_grid_desc_m_n_, + arg.block_2_e1tile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const A0DataType* p_a0, + const B0DataType* p_b0, + std::array p_d0s, + const B1DataType* p_b1, + std::array p_d1s, + E1DataType* p_e1, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, + index_t Batch, + index_t StrideA0, + index_t StrideB0, + std::array StrideD0s, + index_t StrideB1, + std::array StrideD1s, + index_t StrideE1, + index_t BatchStrideA0, + index_t BatchStrideB0, + std::array BatchStrideD0s, + index_t BatchStrideB1, + std::array BatchStrideD1s, + index_t BatchStrideE1, + A0ElementwiseOperation a0_element_op, + B0ElementwiseOperation b0_element_op, + CDE0ElementwiseOperation cde0_element_op, + B1ElementwiseOperation b1_element_op, + CDE1ElementwiseOperation cde1_element_op) + { + return Argument{p_a0, p_b0, + p_d0s, p_b1, + p_d1s, p_e1, + MRaw, NRaw, + KRaw, Gemm1NRaw, + Batch, StrideA0, + StrideB0, StrideD0s, + StrideB1, StrideD1s, + StrideE1, BatchStrideA0, + BatchStrideB0, BatchStrideD0s, + BatchStrideB1, BatchStrideD1s, + BatchStrideE1, a0_element_op, + b0_element_op, cde0_element_op, + b1_element_op, cde1_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a0, + const void* p_b0, + std::array p_d0s, + const void* p_b1, + std::array p_d1s, + void* p_e1, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, + index_t Batch, + index_t StrideA0, + index_t StrideB0, + std::array StrideD0s, + index_t StrideB1, + std::array StrideD1s, + index_t StrideE1, + index_t BatchStrideA0, + index_t BatchStrideB0, + std::array BatchStrideD0s, + index_t BatchStrideB1, + std::array BatchStrideD1s, + index_t BatchStrideE1, + A0ElementwiseOperation a0_element_op, + B0ElementwiseOperation b0_element_op, + CDE0ElementwiseOperation cde0_element_op, + B1ElementwiseOperation b1_element_op, + CDE1ElementwiseOperation cde1_element_op) override + { + return std::make_unique(static_cast(p_a0), + static_cast(p_b0), + p_d0s, + static_cast(p_b1), + p_d1s, + static_cast(p_e1), + MRaw, + NRaw, + KRaw, + Gemm1NRaw, + Batch, + StrideA0, + StrideB0, + StrideD0s, + StrideB1, + StrideD1s, + StrideE1, + BatchStrideA0, + BatchStrideB0, + BatchStrideD0s, + BatchStrideB1, + BatchStrideD1s, + BatchStrideE1, + a0_element_op, + b0_element_op, + cde0_element_op, + b1_element_op, + cde1_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << 
Gemm0MPerBlock << ", " + << Gemm0NPerBlock << ", " + << Gemm0KPerBlock << ", " + << A0K1 << ", " + << B0K1 << ", " + << B1K1 << ", " + << Gemm0MPerXdl << ", " + << Gemm0NPerXdl << ", " + << Gemm0MXdlPerWave << ", " + << Gemm0NXdlPerWave << ", " + << Gemm1NXdlPerWave << "> "; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp new file mode 100644 index 00000000..3c5fdbda --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp @@ -0,0 +1,1001 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_reduce_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + ReducePtrsGlobal p_reduces_grid, + const index_t batch_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const ReduceInElementwiseOperations reduce_in_element_ops, + const ReduceAccElementwiseOperations reduce_out_element_ops, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, + const ComputeBasePrtOfBatch compute_base_ptr_of_batch_, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch_.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch_.GetBBasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch_.GetCBasePtr(g_idx))); + + static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) { + const long_index_t d_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch_.GetDBasePtr(g_idx, In))); + p_reduces_grid(In) = p_reduces_grid(In) + d_batch_offset; + }); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + 
GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_c_grid + c_batch_offset, + p_reduces_grid, + p_shared, + a_element_op, + b_element_op, + c_element_op, + reduce_in_element_ops, + reduce_out_element_ops, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + reduce_grid_desc_mblock_mperblock, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_reduces_grid; + ignore = batch_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = reduce_in_element_ops; + ignore = reduce_out_element_ops; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = reduce_grid_desc_mblock_mperblock; + ignore = compute_base_ptr_of_batch_; + ignore = block_2_ctile_map; +#endif +} + +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. +template +struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperations::Size()> +{ + using DeviceOp = DeviceBatchedGemmReduce_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = 
transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + 
make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // not pad N or K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } + } + + // assume D is packed tensor + static auto MakeReduceGridDescriptor_M(index_t MRaw) + { + const auto d_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw)); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto MPad = M - MRaw; + + if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M + return transform_tensor_descriptor(d_grid_desc_mraw, + make_tuple(make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + } + else + { + // not pad M + return d_grid_desc_mraw; + } + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + using ReduceGridDesc_M = decltype(MakeReduceGridDescriptor_M(1)); + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideC, + index_t BatchStrideD) + : BatchStrideA_(BatchStrideA), + 
BatchStrideB_(BatchStrideB), + BatchStrideC_(BatchStrideC), + BatchStrideD_(BatchStrideD) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideC_); + } + + template + __host__ __device__ constexpr long_index_t GetDBasePtr(index_t g_idx, + Number reduction_idx) const + { + // TODO - Support sequence of StrideD in MakeArgument() + (void)reduction_idx; + return g_idx * static_cast(BatchStrideD_); + } + + private: + index_t BatchStrideA_; + index_t BatchStrideB_; + index_t BatchStrideC_; + index_t BatchStrideD_; + }; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + ReduceAccDataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ReduceOperations, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + InMemoryDataOperationEnum::Set, + ReduceGlobalMemoryDataOperation, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + CGridDesc_M_N, + ReduceGridDesc_M, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + CReduceThreadClusterLengths_MPerBlock_NPerBlock, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + ReducePtrsGlobal p_reduces_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + ReduceInElementwiseOperations reduce_in_element_ops, + ReduceAccElementwiseOperations reduce_out_element_ops, + index_t Batch) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + p_reduces_grid_{p_reduces_grid}, + Batch_(Batch), + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)}, + reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + reduce_grid_desc_mblock_mperblock_{}, + compute_base_ptr_of_batch_{ + 
type_convert(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()), + type_convert(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()), + type_convert(c_grid_desc_m_n_.GetElementSpaceSize()), + type_convert(reduce_grid_desc_m_.GetElementSpaceSize())}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + reduce_in_element_ops_{reduce_in_element_ops}, + reduce_out_element_ops_{reduce_out_element_ops} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + + reduce_grid_desc_mblock_mperblock_ = + GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + ReducePtrsGlobal p_reduces_grid_; + index_t Batch_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + ReduceGridDesc_M reduce_grid_desc_m_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock + reduce_grid_desc_mblock_mperblock_; + ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + ReduceInElementwiseOperations reduce_in_element_ops_; + ReduceAccElementwiseOperations reduce_out_element_ops_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.Batch_ = " << arg.Batch_ << std::endl; + + std::cout << "arg.a_grid_desc_ak0_m_ak1_{" + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_bk0_n_bk1_{" + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + std::cout << "arg.reduce_grid_desc_m_{ " << arg.reduce_grid_desc_m_.GetLength(I0) << "}" + << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float elapsed_time = 0.0f; + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock, + ComputeBasePtrOfStridedBatch, + typename GridwiseGemm::DefaultBlock2CTileMap, + true>; + + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_reduces_grid_, + arg.Batch_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.reduce_in_element_ops_, + arg.reduce_out_element_ops_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.reduce_grid_desc_mblock_mperblock_, + arg.compute_base_ptr_of_batch_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_batched_gemm_reduce_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock, + ComputeBasePtrOfStridedBatch, + typename GridwiseGemm::DefaultBlock2CTileMap, + false>; + + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_reduces_grid_, + arg.Batch_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.reduce_in_element_ops_, + arg.reduce_out_element_ops_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.reduce_grid_desc_mblock_mperblock_, + arg.compute_base_ptr_of_batch_, + arg.block_2_ctile_map_); + } + + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + auto casted_p_arg = dynamic_cast(p_arg); + if(casted_p_arg == nullptr) + { + return false; + } + else + { + return IsSupportedArgument(*casted_p_arg); + } + } + + static constexpr int NumReduce = ReduceOperations::Size(); + static auto MakeArgument(const void* p_a, + const void* 
p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_op, + std::array reduce_out_element_op, + index_t Batch) + { + (void)p_bias; + (void)p_ds; + (void)StrideDs; + (void)d_element_ops; + + ReducePtrsGlobal reduce_tuple = generate_tuple( + [&](auto I) { + auto tmp = ReducePtrsGlobal{}[I]; + using T = remove_pointer_t; + return static_cast(p_reduces[I]); + }, + Number{}); + + ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceInElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_in_element_op[I])); + }, + Number{}); + ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceAccElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_out_element_op[I])); + }, + Number{}); + + AElementwiseOperation a_element_op = + *(static_cast(gemm_element_ops[0])); + BElementwiseOperation b_element_op = + *(static_cast(gemm_element_ops[1])); + CElementwiseOperation c_element_op = + *(static_cast(gemm_element_ops[2])); + + return Argument{static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + reduce_tuple, + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + reduce_in_element_ops, + reduce_out_element_ops, + Batch}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_op, + std::array reduce_out_element_op, + index_t Batch = 1) override + { + (void)p_bias; + (void)p_ds; + (void)StrideDs; + (void)d_element_ops; + + ReducePtrsGlobal reduce_tuple = generate_tuple( + [&](auto I) { + auto tmp = ReducePtrsGlobal{}[I]; + using T = remove_pointer_t; + return static_cast(p_reduces[I]); + }, + Number{}); + + ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceInElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_in_element_op[I])); + }, + Number{}); + ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceAccElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_out_element_op[I])); + }, + Number{}); + + AElementwiseOperation a_element_op = + *(static_cast(gemm_element_ops[0])); + BElementwiseOperation b_element_op = + *(static_cast(gemm_element_ops[1])); + CElementwiseOperation c_element_op = + *(static_cast(gemm_element_ops[2])); + + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + reduce_tuple, + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + reduce_in_element_ops, + reduce_out_element_ops, + Batch); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + 
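+        // For reference, with illustrative (not prescribed) tuning parameters
+        // BlockSize = 256, MPerBlock = 256, NPerBlock = 128, KPerBlock = 32,
+        // AK1 = 8 and BK1 = 8, the identifier built below reads:
+        //   "DeviceBatchedGemmReduce_Xdl_CShuffle<256, 256, 128, 32, 8, 8>"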
auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmReduce_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp new file mode 100644 index 00000000..5baa0f8d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp @@ -0,0 +1,859 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp" +#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const AccElementwiseOperation acc_element_op, + const B1ElementwiseOperation b1_element_op, + const CElementwiseOperation c_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap block_2_ctile_map, + const index_t batch_count, + const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch, + const C0MatrixMask c0_matrix_mask) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetBBasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + 
static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b1_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_ctile_map, + c0_matrix_mask); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = acc_element_op; + ignore = b1_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = b1_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_ctile_map; + ignore = batch_count; + ignore = compute_base_ptr_of_batch; + ignore = c0_matrix_mask; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// Computes C = A * B0 * B1 +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) +template +struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle + : public DeviceBatchedGemmSoftmaxGemmPermute +{ + static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0, + "Number of dimension must be greater than 0"); + + static constexpr index_t NumAcc0Bias = Acc0BiasDataType::Size(); + static constexpr index_t NumAcc1Bias = Acc1BiasDataType::Size(); + + // TODO ANT: implement bias combination + static_assert(NumAcc0Bias == 0 && NumAcc0Bias == 0, "Bias addition is unimplemented"); + +#if 0 + // TODO ANT: use alias + static constexpr index_t NumDimGemm0M = NumDimM; + static constexpr index_t NumDimGemm0N = NumDimN; + static constexpr index_t NumDimGemm0K = NumDimK; + static constexpr index_t NumDimGemm1M = NumDimM; + static constexpr index_t NumDimGemm1N = NumDimO; + static constexpr index_t NumDimGemm1K = NumDimN; +#endif + + using DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + using Transform = TransformBatchedContractionContractionToBatchedGemmGemm< + Sequence, + Sequence, + GemmSpec, + ASpec, + BSpec, + B1Spec, + CSpec>; + + static auto MakeAGridDescriptor_AK0_M_AK1(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + return Transform::MakeAGridDescriptor_AK0_M_AK1( + Transform::MakeAGridDescriptor_M_K(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec), + Number{}); + } + + static auto MakeBGridDescriptor_BK0_N_BK1(const std::vector& b_gs_ns_ks_lengths_vec, + const std::vector& b_gs_ns_ks_strides_vec) + { + return Transform::MakeB0GridDescriptor_BK0_N_BK1( + Transform::MakeB0GridDescriptor_N_K(b_gs_ns_ks_lengths_vec, b_gs_ns_ks_strides_vec), + Number{}); + } + + static auto + MakeB1GridDescriptor_BK0_N_BK1(const std::vector& b1_gs_gemm1ns_gemm1ks_lengths_vec, + const std::vector& b1_gs_gemm1ns_gemm1ks_strides_vec) + { + return Transform::MakeB1GridDescriptor_BK0_N_BK1( + Transform::MakeB1GridDescriptor_N_K(b1_gs_gemm1ns_gemm1ks_lengths_vec, + b1_gs_gemm1ns_gemm1ks_strides_vec), + Number{}); + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1({}, {})); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1({}, {})); + using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1({}, {})); + using CGridDesc_M_N = 
decltype(Transform::MakeCGridDescriptor_M_N({}, {})); + using AGridDesc_G_M_K = decltype(Transform::MakeAGridDescriptor_G_M_K({}, {})); + using BGridDesc_G_N_K = decltype(Transform::MakeB0GridDescriptor_G_N_K({}, {})); + using B1GridDesc_G_N_K = decltype(Transform::MakeB1GridDescriptor_G_N_K({}, {})); + using CGridDesc_G_M_N = decltype(Transform::MakeCGridDescriptor_G_M_N({}, {})); + + constexpr static auto make_MaskOutPredicate() + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + return MaskDisabledPredicate{}; + } + else if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + return MaskOutUpperTrianglePredicate{}; + } + } + using C0MatrixMask = C0MatrixMask_impl; + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(const AGridDesc_G_M_K& a_grid_desc_g_m_k, + const BGridDesc_G_N_K& b_grid_desc_g_n_k, + const B1GridDesc_G_N_K& b1_grid_desc_g_n_k, + const CGridDesc_G_M_N& c_grid_desc_g_m_n) + : a_grid_desc_g_m_k_(a_grid_desc_g_m_k), + b_grid_desc_g_n_k_(b_grid_desc_g_n_k), + b1_grid_desc_g_n_k_(b1_grid_desc_g_n_k), + c_grid_desc_g_m_n_(c_grid_desc_g_m_n) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return a_grid_desc_g_m_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return b_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return b1_grid_desc_g_n_k_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return c_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0)); + } + + private: + AGridDesc_G_M_K a_grid_desc_g_m_k_; + BGridDesc_G_N_K b_grid_desc_g_n_k_; + B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + }; + + // GridwiseGemm + using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + B1GridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + AK1, + BK1, + B1K1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + Gemm1NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + true, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + true, + BBlockLdsExtraN, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + false, + B1BlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + 
CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched, + Transform::matrix_padder.PadN, + MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle>; + + // Argument + // FIXME: constness + struct Argument : public BaseArgument + { + Argument( + const ADataType* p_a_grid, + const BDataType* p_b_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_ak0_m_ak1_{ + DeviceOp::MakeAGridDescriptor_AK0_M_AK1(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b_grid_desc_bk0_n_bk1_{ + DeviceOp::MakeBGridDescriptor_BK0_N_BK1(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, + b1_grid_desc_bk0_n_bk1_{DeviceOp::MakeB1GridDescriptor_BK0_N_BK1( + b1_gs_gemm1ns_gemm1ks_lengths, b1_gs_gemm1ns_gemm1ks_strides)}, + c_grid_desc_m_n_{Transform::MakeCGridDescriptor_M_N(c_gs_ms_gemm1ns_lengths, + c_gs_ms_gemm1ns_strides)}, + a_grid_desc_g_m_k_{ + Transform::MakeAGridDescriptor_G_M_K(a_gs_ms_ks_lengths, a_gs_ms_ks_strides)}, + b_grid_desc_g_n_k_{ + Transform::MakeB0GridDescriptor_G_N_K(b_gs_ns_ks_lengths, b_gs_ns_ks_strides)}, + b1_grid_desc_g_n_k_{Transform::MakeB1GridDescriptor_G_N_K( + b1_gs_gemm1ns_gemm1ks_lengths, b1_gs_gemm1ns_gemm1ks_strides)}, + c_grid_desc_g_m_n_{Transform::MakeCGridDescriptor_G_M_N(c_gs_ms_gemm1ns_lengths, + c_gs_ms_gemm1ns_strides)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op}, + c0_matrix_mask_{b_grid_desc_g_n_k_.GetLength(I1)}, + raw_lengths_mz_nz_kz_gemm1nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1], + b_gs_ns_ks_lengths[NumDimG + NumDimN - 1], + b_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1], + b1_gs_gemm1ns_gemm1ks_lengths[NumDimG + NumDimO - 1]}, + a_mz_kz_strides_{a_gs_ms_ks_strides[NumDimG + NumDimM - 1], + a_gs_ms_ks_strides[NumDimG + NumDimM + NumDimK - 1]}, + b_nz_kz_strides_{b_gs_ns_ks_strides[NumDimG + NumDimN - 1], + b_gs_ns_ks_strides[NumDimG + NumDimN + NumDimK - 1]}, + b1_nz_kz_strides_{b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO - 1], + b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO + NumDimN - 1]}, + c_mz_gemm1nz_strides_{c_gs_ms_gemm1ns_strides[NumDimG + NumDimM - 1], + c_gs_ms_gemm1ns_strides[NumDimG + NumDimM + NumDimO - 1]}, + batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}, + 
compute_base_ptr_of_batch_{ + a_grid_desc_g_m_k_, b_grid_desc_g_n_k_, b1_grid_desc_g_n_k_, c_grid_desc_g_m_n_} + { + // TODO ANT: implement bias addition + ignore = p_acc0_biases; + ignore = p_acc1_biases; + ignore = acc0_biases_gs_ms_ns_lengths; + ignore = acc0_biases_gs_ms_ns_strides; + ignore = acc1_biases_gs_ms_gemm1ns_lengths; + ignore = acc1_biases_gs_ms_gemm1ns_strides; + + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + b1_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "a_grid_desc_g_m_k_: " << a_grid_desc_g_m_k_.GetLength(I0) << ", " + << a_grid_desc_g_m_k_.GetLength(I1) << ", " + << a_grid_desc_g_m_k_.GetLength(I2) << '\n'; + // a_grid_desc_g_m_k_.Print(); + std::cout << "b_grid_desc_g_n_k_: " << b_grid_desc_g_n_k_.GetLength(I0) << ", " + << b_grid_desc_g_n_k_.GetLength(I1) << ", " + << b_grid_desc_g_n_k_.GetLength(I2) << '\n'; + // b_grid_desc_g_n_k_.Print(); + std::cout << "b1_grid_desc_g_n_k_: " << b1_grid_desc_g_n_k_.GetLength(I0) << ", " + << b1_grid_desc_g_n_k_.GetLength(I1) << ", " + << b1_grid_desc_g_n_k_.GetLength(I2) << '\n'; + // b1_grid_desc_g_n_k_.Print(); + std::cout << "c_grid_desc_g_m_n_: " << c_grid_desc_g_m_n_.GetLength(I0) << ", " + << c_grid_desc_g_m_n_.GetLength(I1) << ", " + << c_grid_desc_g_m_n_.GetLength(I2) << '\n'; + // c_grid_desc_g_m_n_.Print(); + } + + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + + // tensor descriptor + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + AGridDesc_G_M_K a_grid_desc_g_m_k_; + BGridDesc_G_N_K b_grid_desc_g_n_k_; + B1GridDesc_G_N_K b1_grid_desc_g_n_k_; + CGridDesc_G_M_N c_grid_desc_g_m_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-c-tile map + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + + // check C0 masking and padding + C0MatrixMask c0_matrix_mask_; + + // For robust IsSupportedArgument() check + std::vector raw_lengths_mz_nz_kz_gemm1nz_; + std::vector a_mz_kz_strides_; + std::vector b_nz_kz_strides_; + std::vector b1_nz_kz_strides_; + std::vector c_mz_gemm1nz_strides_; + + index_t batch_count_; + ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!DeviceOp::IsSupportedArgument(arg)) + { + throw std::runtime_error("wrong! 
unsupported argument"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; + + // Gemm0_K + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::B1GridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + ComputeBasePtrOfStridedBatch, + C0MatrixMask, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_b1_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.b1_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_, + arg.batch_count_, + arg.compute_base_ptr_of_batch_, + arg.c0_matrix_mask_); + }; + + // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need + // to concern Gemm0's loop + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { +#if 0 + arg.Print(); +#endif + + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + // TODO ANT: Check if tensor specialization & strides mismatch + + // Check if C permute dimension matches GEMM + GEMM shape + const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded + const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0); + const index_t c_gemm1n = arg.c_grid_desc_m_n_.GetLength(I1); + const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1); + const index_t b1_gemm1n = arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1); + + if(!(c_g == arg.batch_count_ && c_m == a_m && c_gemm1n == b1_gemm1n)) + { + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O + const auto MzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[0]; + const auto NzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[1]; + const auto KzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[2]; + const auto Gemm1NzRaw = arg.raw_lengths_mz_nz_kz_gemm1nz_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw; + const auto b_extent_lowest = BBlockTransferSrcVectorDim == 2 ? KzRaw : NzRaw; + const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? 
NzRaw : Gemm1NzRaw; + const auto c_extent_lowest = Gemm1NzRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + + // Check vector load/store requirement + const auto a_stride_lowest = + ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0]; + const auto b_stride_lowest = + BBlockTransferSrcVectorDim == 2 ? arg.b_nz_kz_strides_[1] : arg.b_nz_kz_strides_[0]; + const auto b1_stride_lowest = + B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_kz_strides_[1] : arg.b1_nz_kz_strides_[0]; + const auto c_stride_lowest = + arg.c_mz_gemm1nz_strides_[1]; // cshuffle assumes lowest dim in Gemm1Ns to be contiguous + + if(!(a_stride_lowest == 1 || b_stride_lowest == 1 || b1_stride_lowest == 1 || + c_stride_lowest == 1)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const ADataType* p_a, + const BDataType* p_b, + const B1DataType* p_b1, + CDataType* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_b1, + p_c, + p_acc0_biases, + p_acc1_biases, + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + acc0_biases_gs_ms_ns_lengths, + acc0_biases_gs_ms_ns_strides, + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + // FIXME: constness + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const void* p_b1, + void* p_c, + const std::array p_acc0_biases, + const std::array p_acc1_biases, + const std::vector& a_gs_ms_ks_lengths, + const std::vector& a_gs_ms_ks_strides, + const std::vector& 
b_gs_ns_ks_lengths, + const std::vector& b_gs_ns_ks_strides, + const std::vector& b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + const std::vector& b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + const std::vector& c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + const std::vector& c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_lengths, + const std::array, NumAcc0Bias> acc0_biases_gs_ms_ns_strides, + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_lengths, // acc1_biases_gs_ms_os_lengths + const std::array, NumAcc1Bias> + acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_b1), + static_cast(p_c), + p_acc0_biases, // cast in struct Argument + p_acc1_biases, // cast in struct Argument + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b_gs_ns_ks_lengths, + b_gs_ns_ks_strides, + b1_gs_gemm1ns_gemm1ks_lengths, // b1_gs_os_ns_lengths + b1_gs_gemm1ns_gemm1ks_strides, // b1_gs_os_ns_strides + c_gs_ms_gemm1ns_lengths, // c_gs_ms_os_lengths + c_gs_ms_gemm1ns_strides, // c_gs_ms_os_strides + acc0_biases_gs_ms_ns_lengths, + acc0_biases_gs_ms_ns_strides, + acc1_biases_gs_ms_gemm1ns_lengths, + acc1_biases_gs_ms_gemm1ns_strides, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << Gemm1NPerBlock << ", " + << Gemm1KPerBlock << ", " + << B1K1 << ", " + << getGemmSpecializationString(GemmSpec) << ", " + << "ASpec" << getTensorSpecializationString(ASpec) << ", " + << "B0Spec" << getTensorSpecializationString(BSpec) << ", " + << "B1Spec" << getTensorSpecializationString(B1Spec) << ", " + << "CSpec" << getTensorSpecializationString(CSpec) << ", " + << getMaskingSpecializationString(MaskingSpec) << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp new file mode 100644 index 00000000..1f21f2d7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp @@ -0,0 +1,771 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
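+// This header adds DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle, a fused batched kernel of
+// the form C = Softmax(A * B0) * B1 (an attention-style pattern): the first GEMM's
+// accumulator is passed through the Acc elementwise op and a softmax (optionally masked
+// to the upper triangle) before feeding the second GEMM.
+//
+// Minimal usage sketch, assuming a concrete instantiation named DeviceOpInstance and
+// placeholder pointers, sizes, strides and elementwise ops (none of these values are
+// taken from this file); the argument order follows the Argument constructor below:
+//
+//   DeviceOpInstance::Argument arg{p_a, p_b0, p_b1, p_c,
+//                                  M, N, K, O, batch,
+//                                  StrideA, StrideB0, StrideB1, StrideC,
+//                                  BatchStrideA, BatchStrideB0, BatchStrideB1, BatchStrideC,
+//                                  a_op, b0_op, acc_op, b1_op, c_op};
+//   DeviceOpInstance::Invoker invoker;
+//   if(DeviceOpInstance::IsSupportedArgument(arg))
+//       invoker.Run(arg, StreamConfig{});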
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/masking_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const AccElementwiseOperation acc_element_op, + const B1ElementwiseOperation b1_element_op, + const CElementwiseOperation c_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap block_2_ctile_map, + const index_t batch_count, + const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch, + const C0MatrixMask c0_matrix_mask) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetABasePtr(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetBBasePtr(g_idx))); + const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetB1BasePtr(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_base_ptr_of_batch.GetCBasePtr(g_idx))); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_b1_grid + b1_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + b1_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_ctile_map, + c0_matrix_mask); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_b1_grid; + ignore = p_c_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = acc_element_op; + ignore = b1_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = b1_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_ctile_map; + ignore = batch_count; + ignore = 
compute_base_ptr_of_batch; + ignore = c0_matrix_mask; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// Computes C = A * B0 * B1 +// ^^^^^^ (Acc0) +// ^^^^^^^^^^^ (Acc1) + +// When using NPadding as GemmSpecialization, AccElementwiseOperation should be set to +// ScaleAndResetNaNToMinusInfinity. +// if !isNan(AccElement) +// AccElement *= scale +// else +// AccElement = -INFINITY +// Otherwise, result may be wrong. + +template +struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle + : public DeviceBatchedGemmSoftmaxGemm +{ + using DeviceOp = DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto matrix_padder = + GemmGemmPadder{ + MPerBlock, NPerBlock, KPerBlock, Gemm1NPerBlock}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto a_grid_desc_m_k = matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / AK1; + + return transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto b_grid_desc_n_k = matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / BK1; + + return transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // Args: Gemm1KRaw, Gemm1NRaw, StrideB1 + static auto MakeB1GridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b1_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto b1_grid_desc_n_k = matrix_padder.PadB1Descriptor_N_K(b1_grid_desc_nraw_kraw); + + const auto N = b1_grid_desc_n_k.GetLength(I0); + const auto K = b1_grid_desc_n_k.GetLength(I1); + + const auto B1K0 = K / B1K1; + + return transform_tensor_descriptor( + b1_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + static auto 
MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(c_grid_desc_mraw_nraw); + } + + struct ComputeBasePtrOfStridedBatch + { + ComputeBasePtrOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideB1_(BatchStrideB1), + BatchStrideC_(BatchStrideC) + { + } + + __host__ __device__ constexpr long_index_t GetABasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr long_index_t GetB1BasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB1_); + } + + __host__ __device__ constexpr long_index_t GetCBasePtr(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideC_); + } + + private: + index_t BatchStrideA_; + index_t BatchStrideB_; + index_t BatchStrideB1_; + index_t BatchStrideC_; + }; + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using B1GridDesc_BK0_N_BK1 = decltype(MakeB1GridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + using C0MatrixMask = conditional_t, + C0MatrixMask_impl>; + + // GridwiseGemm + using GridwiseGemm = GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + B1GridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + AK1, + BK1, + B1K1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + Gemm1NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + true, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + true, + BBlockLdsExtraN, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + B1BlockTransferSrcAccessOrder, + B1BlockTransferSrcVectorDim, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + false, + B1BlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched, + matrix_padder.PadN, + MaskOutUpperTriangle>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* 
p_a_grid, + const BDataType* p_b_grid, + const B1DataType* p_b1_grid, + CDataType* p_c_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, // = ORaw + index_t Batch, + index_t StrideA, + index_t StrideB, + index_t StrideB1, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_b1_grid_{p_b1_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + b1_grid_desc_bk0_n_bk1_{ + DeviceOp::MakeB1GridDescriptor_BK0_N_BK1(NRaw, Gemm1NRaw, StrideB1)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, Gemm1NRaw, StrideC)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + b1_element_op_{b1_element_op}, + c_element_op_{c_element_op}, + batch_count_(Batch), + compute_base_ptr_of_batch_{BatchStrideA, BatchStrideB, BatchStrideB1, BatchStrideC}, + c0_matrix_mask_{NRaw}, + raw_lengths_m_n_k_o_{MRaw, NRaw, KRaw, Gemm1NRaw} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + b1_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + const B1DataType* p_b1_grid_; + CDataType* p_c_grid_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + B1GridDesc_BK0_N_BK1 b1_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + AccElementwiseOperation acc_element_op_; + B1ElementwiseOperation b1_element_op_; + CElementwiseOperation c_element_op_; + index_t batch_count_; + ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_; + + // check C0 masking and padding + C0MatrixMask c0_matrix_mask_; + + // For robust IsSupportedArgument() check + std::vector raw_lengths_m_n_k_o_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_; + + // Gemm0_K + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_batched_gemm_softmax_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + B1ElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::B1GridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + ComputeBasePtrOfStridedBatch, + C0MatrixMask, + has_main_k_block_loop_>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_b1_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.b1_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_, + arg.batch_count_, + arg.compute_base_ptr_of_batch_, + arg.c0_matrix_mask_); + }; + + // Gemm1_K is split into Gemm1_K0/K1 where K1 is known at compile time, so we only need + // to concern Gemm0's loop + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + // Note: we need raw lengths since threadwise copy can not handle vector load when part of + // vector is out of bounds + const auto MRaw = arg.raw_lengths_m_n_k_o_[0]; + const auto NRaw = arg.raw_lengths_m_n_k_o_[1]; + const auto KRaw = arg.raw_lengths_m_n_k_o_[2]; + const auto Gemm1NRaw = arg.raw_lengths_m_n_k_o_[3]; + + // Check scalar per vector requirement + const auto a_extent_lowest = + is_same_v ? KRaw : MRaw; + const auto b_extent_lowest = + is_same_v ? NRaw : KRaw; + const auto b1_extent_lowest = + is_same_v ? Gemm1NRaw : NRaw; + const auto c_extent_lowest = + is_same_v ? 
Gemm1NRaw : MRaw; + + if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 && + b_extent_lowest % BBlockTransferSrcScalarPerVector == 0 && + b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 && + c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.b1_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + const B1DataType* p_b1, + CDataType* p_c, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, + index_t Batch, + index_t StrideA, + index_t StrideB, + index_t StrideB1, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, p_b, p_b1, p_c, MRaw, + NRaw, KRaw, Gemm1NRaw, Batch, StrideA, + StrideB, StrideB1, StrideC, BatchStrideA, BatchStrideB, + BatchStrideB1, BatchStrideC, a_element_op, b_element_op, acc_element_op, + b1_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_b1, + void* p_c, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t Gemm1NRaw, + index_t Batch, + index_t StrideA, + index_t StrideB, + index_t StrideB1, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideB1, + index_t BatchStrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + B1ElementwiseOperation b1_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_b1), + static_cast(p_c), + MRaw, + NRaw, + KRaw, + Gemm1NRaw, + Batch, + StrideA, + StrideB, + StrideB1, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideB1, + BatchStrideC, + a_element_op, + b_element_op, + acc_element_op, + b1_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerBlock << ", " + << Gemm1NPerBlock << ", " + << Gemm1KPerBlock << ", " + << B1K1 << ", " + << getGemmSpecializationString(GemmSpec) << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp new file mode 100644 index 00000000..5ea32963 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp @@ -0,0 +1,668 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced 
Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2CTileMap Block2CTileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link + * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of + * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2CTileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). 
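+ *
+ * \note Editorial sketch (not part of the original interface), assuming the evenly strided
+ * layout implemented by ComputePtrOffsetOfStridedBatch below: the kernel derives the batch
+ * index from the 1-D block id and turns it into plain strided base offsets, roughly
+ * \code
+ *   num_blocks_per_batch = grid_size / batch_count;
+ *   g_idx                = block_id  / num_blocks_per_batch;
+ *   a_offset             = g_idx * (long_index_t)BatchStrideA;   // likewise for B and C
+ * \endcode
+ * so all workgroups whose block id falls into the same num_blocks_per_batch-sized slice
+ * operate on the same batch.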
+ * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_xdlops_v2r3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const index_t batch_count, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx))); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; +#endif +} + +template +struct DeviceBatchedGemmXdl : public DeviceBatchedGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto K1Number = Number{}; + + static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + + const auto a_grid_desc_k0_mp_k1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_k0_mp_k1; + } + + static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return 
make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + const auto b_grid_desc_k0_np_k1 = + transform_tensor_descriptor(b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_k0_np_k1; + } + + static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + { + const auto c_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + const auto c_grid_desc_mp_np = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return c_grid_desc_mp_np; + } + + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + struct ComputePtrOffsetOfStridedBatch + { + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideC) + : BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideC_); + } + + private: + index_t BatchStrideA_; + index_t BatchStrideB_; + index_t BatchStrideC_; + }; + + // GridwiseGemm + using GridwiseGemm = + GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + NumGemmKPrefetchStage, + LoopSched, + PipelineVer>; + + using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = + decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{})); + using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideC, + index_t Batch, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + Batch_(Batch), + a_grid_desc_k0_m_k1_{ + DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)}, + b_grid_desc_k0_n_k1_{ + DeviceBatchedGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB)}, + c_grid_desc_m_n_{DeviceBatchedGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC)}, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, + 
compute_ptr_offset_of_batch_{BatchStrideA, BatchStrideB, BatchStrideC}, + block_2_ctile_map_{ + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01)}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + kraw_{K} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ = + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + index_t Batch_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + Block2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + index_t kraw_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceBatchedGemmXdl::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseBatchedGemm_km_kn_m0m1n0n1_xdlops_v2r3 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_; + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_batched_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + remove_reference_t, + true>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.Batch_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_batched_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ComputePtrOffsetOfStridedBatch, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.Batch_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.compute_ptr_offset_of_batch_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(arg.kraw_ % K1 != 0) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideC, + index_t Batch, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideC, + Batch, + 1, + 1, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t BatchStrideA, + index_t BatchStrideB, + index_t BatchStrideC, + index_t Batch, + AElementwiseOperation 
a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideC, + Batch, + 1, + 1, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceBatchedGemmXdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">" + << " NumGemmKPrefetchStage: " + << NumGemmKPrefetchStage << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp new file mode 100644 index 00000000..ab16a757 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp @@ -0,0 +1,874 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/welford_helper.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormBwdImpl : public DeviceBatchNormBwd +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0 && + MThreadSliceSize % DxDstVectorSize == 0) || + (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0 && + KThreadSliceSize % DxDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr 
index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeXY2dDescriptor(const std::array& xyLengths, + const std::array& xyStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleXYLengths = + generate_tuple([&](auto I) { return xyLengths[I]; }, Number{}); + const auto tupleXYStrides = + generate_tuple([&](auto I) { return xyStrides[I]; }, Number{}); + + const auto raw_grid_desc = make_naive_tensor_descriptor(tupleXYLengths, tupleXYStrides); + + const auto grid_desc_m_k = [&]() { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = + generate_tuple([&](auto I) { return xyLengths[NumInvariantDim + I]; }, + Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return xyLengths[I]; }, Number{}); + + return transform_tensor_descriptor(raw_grid_desc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + }(); + + const auto invariantLength = grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = grid_desc_m_k.GetLength(Number<1>{}); + + const int workSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto kPad = workSizePerBlock * blkGroupSize - reduceLength; + + auto grid_desc_m_k_padded = + transform_tensor_descriptor(grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_right_pad_transform(reduceLength, kPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_k_padded); + }; + + static auto MakeMultiblockFirstReduceOutputMG2dDescriptor(int invariantLength, int blkGroupSize) + { + const auto grid_desc_m_g = + make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize)); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto grid_desc_m_g_padded = + transform_tensor_descriptor(grid_desc_m_g, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_pass_through_transform(blkGroupSize)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_g_padded); + }; + + static auto MakeMultiblockFinalReduceInputMK2dDescriptor(int invariantLength, int blkGroupSize) + { + const auto reduceLength = blkGroupSize; + const auto grid_desc_m_k = + make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength)); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto kPad = + math::integer_least_multiple(reduceLength, KThreadClusterSize) - reduceLength; + + auto grid_desc_m_k_padded = + transform_tensor_descriptor(grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_right_pad_transform(reduceLength, kPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_k_padded); + }; + + static auto + MakeScaleBiasMeanVar1dDescriptor(const std::array& lengths, + const std::array& strides) + { + const auto tupleLengths = + generate_tuple([&](auto I) { return lengths[I]; }, Number{}); + const auto tupleStrides = + generate_tuple([&](auto I) { return strides[I]; 
}, Number{}); + + auto raw_grid_desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides); + + auto grid_desc_m = transform_tensor_descriptor( + raw_grid_desc, + make_tuple(make_merge_transform(tupleLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = grid_desc_m.GetLength(Number<0>{}); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto grid_desc_m_padded = + transform_tensor_descriptor(grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, mPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (grid_desc_m_padded); + }; + + using XYGridDesc_M_K = decltype(MakeXY2dDescriptor({1}, {1}, 1, 1)); + using ScaleBiasGridDesc_M = decltype(MakeScaleBiasMeanVar1dDescriptor({1}, {1})); + using MeanVarGridDesc_M = ScaleBiasGridDesc_M; + + struct Argument : public BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnDscaleDbiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const DyDataType* p_dy, + const ScaleDataType* p_scale, + const MeanVarDataType* p_savedMean, + const MeanVarDataType* p_savedInvVar, + const DyElementwiseOp dy_elementwise_op, + double epsilon, + DxDataType* p_dx, + DscaleDbiasDataType* p_dscale, + DscaleDbiasDataType* p_dbias) + : bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnDscaleDbiasStrides_(bnDscaleDbiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + p_dy_(p_dy), + p_scale_(p_scale), + p_savedMean_(p_savedMean), + p_savedInvVar_(p_savedInvVar), + dy_elementwise_op_(dy_elementwise_op), + p_dx_(p_dx), + p_dscale_(p_dscale), + p_dbias_(p_dbias) + { + xyLengths_ = + shuffle_tensor_dimensions(xyLengths, reduceDims); + xStrides_ = + shuffle_tensor_dimensions(xStrides, reduceDims); + dyStrides_ = + shuffle_tensor_dimensions(dyStrides, reduceDims); + dxStrides_ = + shuffle_tensor_dimensions(dxStrides, reduceDims); + + std::tie(invariant_length, reduce_length) = + get_2d_lengths(xyLengths_); + + epsilon_ = type_convert(epsilon); + + haveSavedMeanInvVar_ = (p_savedMean_ != nullptr && p_savedInvVar_ != nullptr); + + if(UseMultiblockInK) + { + int iterations = 1; + while(true) + { + int testBlkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = (reduce_length + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + gridSize = (invariant_length + M_BlockTileSize - 1) / M_BlockTileSize * blkGroupSize; + + x_grid_desc_m_k = + MakeXY2dDescriptor(xyLengths_, xStrides_, blkGroupSize, numBlockTileIteration); + dy_grid_desc_m_k = + MakeXY2dDescriptor(xyLengths_, dyStrides_, blkGroupSize, numBlockTileIteration); + dx_grid_desc_m_k = + MakeXY2dDescriptor(xyLengths_, dxStrides_, blkGroupSize, numBlockTileIteration); + scale_grid_desc_m = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnScaleStrides); + 
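+ // [Editorial note, hedged] The blkGroupSize selection earlier in this constructor picks the
+ // smallest `iterations` such that ceil(reduce_length / (K_BlockTileSize * iterations)) <= 128,
+ // i.e. at most 128 thread blocks cooperate on one reduction row, and blkGroupSize is then that
+ // ceiling. Worked example with assumed (not patch-provided) numbers: reduce_length = 65536 and
+ // K_BlockTileSize = 256 give ceil(65536/256) = 256 > 128, so iterations becomes 2,
+ // blkGroupSize = ceil(65536/512) = 128, and numBlockTileIteration = 2 K-tiles per block.
+ // The remaining 1-D descriptors below cover only the invariant (M) dimension and are padded
+ // up to a multiple of M_BlockTileSize by MakeScaleBiasMeanVar1dDescriptor.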
dscale_dbias_grid_desc_m = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnDscaleDbiasStrides); + mean_var_grid_desc_m = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnMeanVarStrides); + } + + AccDataType epsilon_; + + bool haveSavedMeanInvVar_; + + std::array xyLengths_; + std::array xStrides_; + std::array dyStrides_; + std::array dxStrides_; + + std::array bnScaleBiasMeanVarLengths_; + std::array bnScaleStrides_; + std::array bnDscaleDbiasStrides_; + std::array bnMeanVarStrides_; + + const XDataType* p_x_; + const DyDataType* p_dy_; + const ScaleDataType* p_scale_; + const MeanVarDataType* p_savedMean_; + const MeanVarDataType* p_savedInvVar_; + const DyElementwiseOp dy_elementwise_op_; + DxDataType* p_dx_; + DscaleDbiasDataType* p_dscale_; + DscaleDbiasDataType* p_dbias_; + + long_index_t invariant_length; + long_index_t reduce_length; + + int blkGroupSize; + int numBlockTileIteration; + size_t gridSize; + + XYGridDesc_M_K x_grid_desc_m_k; + XYGridDesc_M_K dy_grid_desc_m_k; + XYGridDesc_M_K dx_grid_desc_m_k; + ScaleBiasGridDesc_M scale_grid_desc_m; + ScaleBiasGridDesc_M dscale_dbias_grid_desc_m; + MeanVarGridDesc_M mean_var_grid_desc_m; + + void* workspace_mean; + void* workspace_variance; + void* workspace_count; + + void* workspace_savedMean; + void* workspace_savedInvVar; + + void* workspace_reduce_dscale; + void* workspace_reduce_dbias; + }; + + size_t GetWorkSpaceSize(const BaseArgument* pArg) const override + { + const Argument* pArg_ = dynamic_cast(pArg); + + size_t workspace_size = 0; + + if(UseMultiblockInK && pArg_->blkGroupSize > 1) + { + // workspace for the partial reduced result for dscale + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(DscaleDbiasDataType) + 64; + + // workspace for the partial reduced result for dbias + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(DscaleDbiasDataType) + 64; + + if(!pArg_->haveSavedMeanInvVar_) + { + // workspace for welford intermediate mean + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64; + + // workspace for welford intermediate variance + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType) + 64; + + // workspace for welford intermediate count + workspace_size += + pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t) + 64; + + // workspace for welford result mean + workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64; + + // workspace for welford result inv_variance + workspace_size += pArg_->invariant_length * sizeof(MeanVarDataType) + 64; + }; + } + + return (workspace_size); + }; + + void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override + { + Argument* pArg_ = dynamic_cast(pArg); + + pArg_->p_workspace_ = p_workspace; + + index_t space_sz; + + // setup buffer for the partial reduced result for dscale + pArg_->workspace_reduce_dscale = pArg_->p_workspace_; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(DscaleDbiasDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for the partial reduced result for dbias + pArg_->workspace_reduce_dbias = + reinterpret_cast(pArg_->workspace_reduce_dscale) + space_sz; + + if(UseMultiblockInK && pArg_->blkGroupSize > 1) + { + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(DscaleDbiasDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford 
intermediate mean + pArg_->workspace_mean = + reinterpret_cast(pArg_->workspace_reduce_dbias) + space_sz; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford intermediate varirance + pArg_->workspace_variance = reinterpret_cast(pArg_->workspace_mean) + space_sz; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(MeanVarDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford intermediate count + pArg_->workspace_count = reinterpret_cast(pArg_->workspace_variance) + space_sz; + + space_sz = pArg_->invariant_length * pArg_->blkGroupSize * sizeof(int32_t); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford result mean + pArg_->workspace_savedMean = reinterpret_cast(pArg_->workspace_count) + space_sz; + + space_sz = pArg_->invariant_length * sizeof(MeanVarDataType); + space_sz = math::integer_least_multiple(space_sz, 64); + + // setup buffer for welford result inv_variance + pArg_->workspace_savedInvVar = + reinterpret_cast(pArg_->workspace_savedMean) + space_sz; + }; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float avg_time = 0; + + const auto mean_var_count_grid_desc_m_g = + DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + const auto dscale_dbias_grid_desc_m_g = + DeviceBatchNormBwdImpl::MakeMultiblockFirstReduceOutputMG2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + const auto mean_var_count_grid_desc_m_k = + DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + const auto dscale_dbias_grid_desc_m_k = + DeviceBatchNormBwdImpl::MakeMultiblockFinalReduceInputMK2dDescriptor( + arg.invariant_length, arg.blkGroupSize); + + using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g); + using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k); + using DscaleDbiasGridDesc_M_G = decltype(dscale_dbias_grid_desc_m_g); + using DscaleDbiasGridDesc_M_K = decltype(dscale_dbias_grid_desc_m_k); + + using GridwiseWelfordSecondHalfReduceFirstHalf_ = + GridwiseWelfordSecondHalfReduceFirstHalf; + + using GridwiseReduceSecondHalfBatchNormBwdFinal_ = + GridwiseReduceSecondHalfBatchNormBackwardFinal; + + if(UseMultiblockInK && arg.blkGroupSize > 1) + { + using GetReduceCountPerThreadFunctor = + GetReduceCountPerThreadForMultiblockWelford; + + GetReduceCountPerThreadFunctor get_reduce_count_per_thread( + arg.blkGroupSize, arg.numBlockTileIteration, arg.reduce_length); + + if(!arg.haveSavedMeanInvVar_) + { + using GridwiseMultiblockWelfordFirstHalf_ = + GridwiseMultiblockWelfordFirstHalf; + + const auto kern_multiblock_welford_first_half = + kernel_multiblock_welford_first_half; + + avg_time += launch_and_time_kernel( + stream_config, + kern_multiblock_welford_first_half, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + mean_var_count_grid_desc_m_g, + get_reduce_count_per_thread, + arg.numBlockTileIteration, + arg.p_x_, + static_cast(arg.workspace_mean), + static_cast(arg.workspace_variance), + static_cast(arg.workspace_count)); + }; + + const auto kern_welford_second_half_reduce_first_half = + kernel_welford_second_half_reduce_first_half< + GridwiseWelfordSecondHalfReduceFirstHalf_, + XDataType, + DyDataType, 
+ AccDataType, + ScaleDataType, + DscaleDbiasDataType, + MeanVarDataType, + DyElementwiseOp, + XYGridDesc_M_K, + MeanVarGridDesc_M, + MeanVarCountGridDesc_M_K, + DscaleDbiasGridDesc_M_G>; + + const auto kern_reduce_second_half_batchnorm_backward_final = + kernel_reduce_second_half_batchnorm_backward_final< + GridwiseReduceSecondHalfBatchNormBwdFinal_, + XDataType, + DyDataType, + DxDataType, + ScaleDataType, + DscaleDbiasDataType, + MeanVarDataType, + DyElementwiseOp, + XYGridDesc_M_K, + DscaleDbiasGridDesc_M_K, + MeanVarGridDesc_M, + ScaleBiasGridDesc_M>; + + index_t numDscaleDbiasBlockTileIteration = + (arg.blkGroupSize + KThreadClusterSize - 1) / KThreadClusterSize; + + avg_time += launch_and_time_kernel( + stream_config, + kern_welford_second_half_reduce_first_half, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + arg.dy_grid_desc_m_k, + arg.mean_var_grid_desc_m, + mean_var_count_grid_desc_m_k, + dscale_dbias_grid_desc_m_g, + arg.blkGroupSize, + arg.numBlockTileIteration, + numDscaleDbiasBlockTileIteration, + arg.epsilon_, + arg.haveSavedMeanInvVar_, + arg.haveSavedMeanInvVar_ ? arg.p_savedMean_ : nullptr, + arg.haveSavedMeanInvVar_ ? arg.p_savedInvVar_ : nullptr, + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_mean), + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_variance), + arg.haveSavedMeanInvVar_ ? nullptr + : static_cast(arg.workspace_count), + arg.dy_elementwise_op_, + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_savedMean), + arg.haveSavedMeanInvVar_ + ? nullptr + : static_cast(arg.workspace_savedInvVar), + arg.p_x_, + arg.p_dy_, + static_cast(arg.workspace_reduce_dscale), + static_cast(arg.workspace_reduce_dbias)); + + avg_time += launch_and_time_kernel( + stream_config, + kern_reduce_second_half_batchnorm_backward_final, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + arg.dy_grid_desc_m_k, + arg.dx_grid_desc_m_k, + dscale_dbias_grid_desc_m_k, + arg.mean_var_grid_desc_m, + arg.scale_grid_desc_m, + arg.dscale_dbias_grid_desc_m, + arg.blkGroupSize, + arg.reduce_length, + arg.numBlockTileIteration, + numDscaleDbiasBlockTileIteration, + static_cast(arg.workspace_reduce_dscale), + static_cast(arg.workspace_reduce_dbias), + arg.haveSavedMeanInvVar_ + ? arg.p_savedMean_ + : static_cast(arg.workspace_savedMean), + arg.haveSavedMeanInvVar_ + ? 
arg.p_savedInvVar_ + : static_cast(arg.workspace_savedInvVar), + arg.p_x_, + arg.p_dy_, + arg.p_scale_, + arg.dy_elementwise_op_, + arg.p_dx_, + arg.p_dscale_, + arg.p_dbias_); + } + else + { + using GetReduceCountPerThreadFunctor = + GetReduceCountPerThreadForBlockwiseWelford; + + GetReduceCountPerThreadFunctor get_reduce_count_per_thread( + arg.numBlockTileIteration, arg.reduce_length); + + using GridwiseBatchNormBackwardWithBlockwiseWelford_ = + GridwiseBatchNormBackwardWithBlockwiseWelford; + + const auto kern_batchnorm_bwd = kernel_batchnorm_backward_with_blockwise_welford< + GridwiseBatchNormBackwardWithBlockwiseWelford_, + XDataType, + DyDataType, + DxDataType, + AccDataType, + ScaleDataType, + DscaleDbiasDataType, + MeanVarDataType, + DyElementwiseOp, + XYGridDesc_M_K, + ScaleBiasGridDesc_M, + MeanVarGridDesc_M, + GetReduceCountPerThreadFunctor>; + + avg_time += launch_and_time_kernel(stream_config, + kern_batchnorm_bwd, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k, + arg.dy_grid_desc_m_k, + arg.dx_grid_desc_m_k, + arg.scale_grid_desc_m, + arg.dscale_dbias_grid_desc_m, + arg.mean_var_grid_desc_m, + get_reduce_count_per_thread, + arg.reduce_length, + arg.numBlockTileIteration, + arg.epsilon_, + arg.p_x_, + arg.p_dy_, + arg.p_scale_, + arg.haveSavedMeanInvVar_, + arg.p_savedMean_, + arg.p_savedInvVar_, + arg.dy_elementwise_op_, + arg.p_dx_, + arg.p_dscale_, + arg.p_dbias_); + }; + + return (avg_time); + }; + + float Run(const BaseArgument* pArg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(pArg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* pArg) override + { + const Argument* pArg_ = dynamic_cast(pArg); + + if constexpr(XDyDxVectorDim == 0) + { + if(pArg_->xStrides_[NumInvariantDim - 1] != 1 || + pArg_->dyStrides_[NumInvariantDim - 1] != 1 || + pArg_->dxStrides_[NumInvariantDim - 1] != 1) + return false; + + if(pArg_->xyLengths_[NumInvariantDim - 1] % XSrcVectorSize != 0 || + pArg_->xyLengths_[NumInvariantDim - 1] % DySrcVectorSize != 0 || + pArg_->xyLengths_[NumInvariantDim - 1] % DxDstVectorSize != 0) + return false; + } + else + { + if(pArg_->xStrides_[Rank - 1] != 1 || pArg_->dyStrides_[Rank - 1] != 1 || + pArg_->dxStrides_[Rank - 1] != 1) + return false; + + if(pArg_->xyLengths_[Rank - 1] % XSrcVectorSize != 0 || + pArg_->xyLengths_[Rank - 1] % DySrcVectorSize != 0 || + pArg_->xyLengths_[Rank - 1] % DxDstVectorSize != 0) + return false; + }; + + if(pArg_->bnScaleStrides_[NumInvariantDim - 1] != 1 && ScaleSrcVectorSize != 1) + return false; + + if(pArg_->bnDscaleDbiasStrides_[NumInvariantDim - 1] != 1 && DscaleDbiasDstVectorSize != 1) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % ScaleSrcVectorSize != 0) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % DscaleDbiasDstVectorSize != 0) + return false; + + if(pArg_->haveSavedMeanInvVar_) + { + if(pArg_->bnMeanVarStrides_[NumInvariantDim - 1] != 1 && MeanVarSrcVectorSize != 1) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % MeanVarSrcVectorSize != 0) + return false; + }; + + bool is_valid = true; + + static_for<0, NumInvariantDim, 1>{}([&](auto I) { + if(pArg_->xyLengths_[I] != pArg_->bnScaleBiasMeanVarLengths_[I]) + is_valid = false; + }); + + if(!is_valid) + return false; + + return true; + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array dyStrides, + const 
std::array dxStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnDscaleDbiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_dy, + const void* p_scale, + const void* p_savedMean, + const void* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + void* p_dx, + void* p_dscale, + void* p_dbias) override + { + return std::make_unique(xyLengths, + xStrides, + dyStrides, + dxStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnDscaleDbiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(p_dy), + static_cast(p_scale), + static_cast(p_savedMean), + static_cast(p_savedInvVar), + dy_elementwise_op, + epsilon, + static_cast(p_dx), + static_cast(p_dscale), + static_cast(p_dbias)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchNormBwdImpl<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "XDyDxVectorDim_" << XDyDxVectorDim << ","; + str << "VectorSize_X" << XSrcVectorSize << "_scale_" << ScaleSrcVectorSize << "_bias_" << DscaleDbiasDstVectorSize << "_mean_var_" << MeanVarSrcVectorSize << "_Dx_" << DxDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; // namespace device + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp new file mode 100644 index 00000000..5a16ff76 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp @@ -0,0 +1,718 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
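+ // [Editorial summary, not part of the original patch] This forward implementation mirrors the
+ // backward one above: when UseMultiblockInK is set and blkGroupSize > 1, a multiblock Welford
+ // "first half" kernel writes partial mean/variance/count into workspace buffers and a "second
+ // half" kernel merges them and applies the normalization; otherwise a single blockwise-Welford
+ // kernel handles the whole reduction. On the multiblock path the caller has to supply that
+ // workspace before running. Host-side sketch with assumed variable names, following the same
+ // argument/invoker pattern as the backward op above:
+ //
+ //   auto arg = op.MakeArgumentPointer(/* lengths, strides, pointers, epsilon, ... */);
+ //   std::size_t ws_size = op.GetWorkSpaceSize(arg.get());
+ //   void* p_ws = nullptr;
+ //   hipMalloc(&p_ws, ws_size);                    // any device allocation works here
+ //   op.SetWorkSpacePointer(arg.get(), p_ws);
+ //   float avg_ms = op.MakeInvokerPointer()->Run(arg.get(), StreamConfig{});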
+ +#pragma once + +#include +#include + +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/device/welford_helper.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp" +#include "ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceBatchNormFwdImpl : public DeviceBatchNormFwd +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcYDstVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeXY2dDescriptor(const std::array& xyLengths, + const std::array& xyStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleXYLengths = + generate_tuple([&](auto I) { return xyLengths[I]; }, Number{}); + const auto tupleXYStrides = + generate_tuple([&](auto I) { return xyStrides[I]; }, Number{}); + + const auto raw_grid_desc = make_naive_tensor_descriptor(tupleXYLengths, tupleXYStrides); + + const auto grid_desc_m_k = [&]() { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = + generate_tuple([&](auto I) { return xyLengths[NumInvariantDim + I]; }, + Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return xyLengths[I]; }, Number{}); + + return transform_tensor_descriptor(raw_grid_desc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + }(); + + const auto invariantLength = grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = grid_desc_m_k.GetLength(Number<1>{}); + + const int workSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto kPad = workSizePerBlock * blkGroupSize - reduceLength; + + auto grid_desc_m_k_padded = + transform_tensor_descriptor(grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_right_pad_transform(reduceLength, kPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_k_padded); + }; + + static auto MakeMeanVarCountOutputMG2dDescriptor(int invariantLength, int blkGroupSize) + { + const auto grid_desc_m_g = + 
make_naive_tensor_descriptor_packed(make_tuple(invariantLength, blkGroupSize)); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto grid_desc_m_g_padded = + transform_tensor_descriptor(grid_desc_m_g, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_pass_through_transform(blkGroupSize)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_g_padded); + }; + + static auto MakeMeanVarCountInputMK2dDescriptor(int invariantLength, int blkGroupSize) + { + const auto reduceLength = blkGroupSize; + const auto grid_desc_m_k = + make_naive_tensor_descriptor_packed(make_tuple(invariantLength, reduceLength)); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto kPad = + math::integer_least_multiple(reduceLength, KThreadClusterSize) - reduceLength; + + auto grid_desc_m_k_padded = + transform_tensor_descriptor(grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, mPad), + make_right_pad_transform(reduceLength, kPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (grid_desc_m_k_padded); + }; + + static auto + MakeScaleBiasMeanVar1dDescriptor(const std::array& lengths, + const std::array& strides) + { + const auto tupleLengths = + generate_tuple([&](auto I) { return lengths[I]; }, Number{}); + const auto tupleStrides = + generate_tuple([&](auto I) { return strides[I]; }, Number{}); + + auto raw_grid_desc = make_naive_tensor_descriptor(tupleLengths, tupleStrides); + + auto grid_desc_m = transform_tensor_descriptor( + raw_grid_desc, + make_tuple(make_merge_transform(tupleLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = grid_desc_m.GetLength(Number<0>{}); + + const auto mPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto grid_desc_m_padded = + transform_tensor_descriptor(grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, mPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (grid_desc_m_padded); + }; + + using XYGridDesc_M_K = decltype(MakeXY2dDescriptor({1}, {1}, 1, 1)); + using ScaleBiasMeanVarGridDesc_M = decltype(MakeScaleBiasMeanVar1dDescriptor({1}, {1})); + + struct Argument : public BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const ScaleDataType* p_scale, + const BiasDataType* p_bias, + const YElementwiseOp y_elementwise_op, + double epsilon, + YDataType* p_y, + MeanVarDataType* resultSaveMean, + MeanVarDataType* resultSaveInvVariance, + double averageFactor, + MeanVarDataType* resultRunningMean, + MeanVarDataType* resultRunningVariance) + : bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnBiasStrides_(bnBiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + p_scale_(p_scale), + p_bias_(p_bias), + y_elementwise_op_(y_elementwise_op), + p_y_(p_y), + resultSaveMean_(resultSaveMean), + resultSaveInvVariance_(resultSaveInvVariance), + resultRunningMean_(resultRunningMean), + 
resultRunningVariance_(resultRunningVariance) + { + xyLengths_ = + shuffle_tensor_dimensions(xyLengths, reduceDims); + xStrides_ = + shuffle_tensor_dimensions(xStrides, reduceDims); + yStrides_ = + shuffle_tensor_dimensions(yStrides, reduceDims); + + std::tie(invariant_length_, reduce_length_) = + get_2d_lengths(xyLengths_); + + epsilon_ = type_convert(epsilon); + averageFactor_ = type_convert(averageFactor); + + updateMovingAverage_ = + (resultRunningMean != nullptr && resultRunningVariance != nullptr); + saveMeanInvVariance_ = (resultSaveMean != nullptr && resultSaveInvVariance_ != nullptr); + + if(UseMultiblockInK) + { + int iterations = 1; + while(true) + { + int testBlkGroupSize = (reduce_length_ + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize_ = (reduce_length_ + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration_ = iterations; + } + else + { + blkGroupSize_ = 1; + numBlockTileIteration_ = (reduce_length_ + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + gridSize_ = (invariant_length_ + M_BlockTileSize - 1) / M_BlockTileSize * blkGroupSize_; + + x_grid_desc_m_k_ = + MakeXY2dDescriptor(xyLengths_, xStrides_, blkGroupSize_, numBlockTileIteration_); + y_grid_desc_m_k_ = + MakeXY2dDescriptor(xyLengths_, yStrides_, blkGroupSize_, numBlockTileIteration_); + scale_grid_desc_m_ = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnScaleStrides_); + bias_grid_desc_m_ = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnBiasStrides_); + mean_var_grid_desc_m_ = + MakeScaleBiasMeanVar1dDescriptor(bnScaleBiasMeanVarLengths, bnMeanVarStrides_); + } + + AccDataType epsilon_; + AccDataType averageFactor_; + + bool updateMovingAverage_; + bool saveMeanInvVariance_; + + std::array xyLengths_; + std::array xStrides_; + std::array yStrides_; + + std::array bnScaleBiasMeanVarLengths_; + std::array bnScaleStrides_; + std::array bnBiasStrides_; + std::array bnMeanVarStrides_; + + const XDataType* p_x_; + const ScaleDataType* p_scale_; + const BiasDataType* p_bias_; + const YElementwiseOp y_elementwise_op_; + YDataType* p_y_; + + MeanVarDataType* resultSaveMean_; + MeanVarDataType* resultSaveInvVariance_; + + MeanVarDataType* resultRunningMean_; + MeanVarDataType* resultRunningVariance_; + + long_index_t invariant_length_; + long_index_t reduce_length_; + + int blkGroupSize_; + int numBlockTileIteration_; + size_t gridSize_; + + XYGridDesc_M_K x_grid_desc_m_k_; + XYGridDesc_M_K y_grid_desc_m_k_; + ScaleBiasMeanVarGridDesc_M scale_grid_desc_m_; + ScaleBiasMeanVarGridDesc_M bias_grid_desc_m_; + ScaleBiasMeanVarGridDesc_M mean_var_grid_desc_m_; + + void* workspace_mean_; + void* workspace_variance_; + void* workspace_count_; + }; + + size_t GetWorkSpaceSize(const BaseArgument* pArg) const override + { + const Argument* pArg_ = dynamic_cast(pArg); + + size_t workspace_size = 0; + + if(UseMultiblockInK && pArg_->blkGroupSize_ > 1) + { + // workspace for welford intermediate mean + workspace_size += + pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType) + 64; + + // workspace for welford intermediate variance + workspace_size += + pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType) + 64; + + // workspace for welford intermediate count + workspace_size += + pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(int32_t) + 64; + } + + return 
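+            // Each of the three intermediate buffers is over-allocated by 64 bytes so that
+            // SetWorkSpacePointer() can round every sub-buffer size up to a 64-byte multiple
+            // and keep the mean, variance and count regions independently aligned.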
(workspace_size); + }; + + void SetWorkSpacePointer(BaseArgument* pArg, void* p_workspace) const override + { + Argument* pArg_ = dynamic_cast(pArg); + + pArg_->p_workspace_ = p_workspace; + + if(UseMultiblockInK && pArg_->blkGroupSize_ > 1) + { + + // setup buffer used for intermediate welford mean + pArg_->workspace_mean_ = static_cast(pArg_->p_workspace_); + + index_t mean_space_sz = + pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType); + + mean_space_sz = math::integer_least_multiple(mean_space_sz, 64); + + // setup buffer used for intermediate welford varirance + pArg_->workspace_variance_ = + reinterpret_cast(pArg_->workspace_mean_) + mean_space_sz; + + index_t variance_space_sz = + pArg_->invariant_length_ * pArg_->blkGroupSize_ * sizeof(MeanVarDataType); + + variance_space_sz = math::integer_least_multiple(variance_space_sz, 64); + + // setup buffer used for intermediate welfor count + pArg_->workspace_count_ = + reinterpret_cast(pArg_->workspace_variance_) + variance_space_sz; + }; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float avg_time = 0; + + if(UseMultiblockInK && arg.blkGroupSize_ > 1) + { + using GetReduceCountPerThreadFunctor = + GetReduceCountPerThreadForMultiblockWelford; + + GetReduceCountPerThreadFunctor get_reduce_count_per_thread( + arg.blkGroupSize_, arg.numBlockTileIteration_, arg.reduce_length_); + + const auto mean_var_count_grid_desc_m_g = + DeviceBatchNormFwdImpl::MakeMeanVarCountOutputMG2dDescriptor( + arg.invariant_length_, arg.blkGroupSize_); + + const auto mean_var_count_grid_desc_m_k = + DeviceBatchNormFwdImpl::MakeMeanVarCountInputMK2dDescriptor( + arg.invariant_length_, arg.blkGroupSize_); + + using MeanVarCountGridDesc_M_G = decltype(mean_var_count_grid_desc_m_g); + using MeanVarCountGridDesc_M_K = decltype(mean_var_count_grid_desc_m_k); + + using GridwiseMultiblockWelfordFirstHalf_ = + GridwiseMultiblockWelfordFirstHalf; + + using GridwiseWelfordSecondHalfBatchNormForwardFinal_ = + GridwiseWelfordSecondHalfBatchNormForwardFinal; + + index_t numMeanVarCountBlockTileIteration = + (arg.blkGroupSize_ + KThreadClusterSize - 1) / KThreadClusterSize; + + const auto kern_multiblock_welford_first_half = + kernel_multiblock_welford_first_half; + + const auto kern_welford_second_half_batchnorm_forward_final = + kernel_welford_second_half_batchnorm_forward_final< + GridwiseWelfordSecondHalfBatchNormForwardFinal_, + XDataType, + YDataType, + AccDataType, + ScaleDataType, + BiasDataType, + MeanVarDataType, + YElementwiseOp, + XYGridDesc_M_K, + MeanVarCountGridDesc_M_K, + ScaleBiasMeanVarGridDesc_M, + ScaleBiasMeanVarGridDesc_M>; + + avg_time += + launch_and_time_kernel(stream_config, + kern_multiblock_welford_first_half, + dim3(arg.gridSize_), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k_, + mean_var_count_grid_desc_m_g, + get_reduce_count_per_thread, + arg.numBlockTileIteration_, + arg.p_x_, + static_cast(arg.workspace_mean_), + static_cast(arg.workspace_variance_), + static_cast(arg.workspace_count_)); + + avg_time += + launch_and_time_kernel(stream_config, + kern_welford_second_half_batchnorm_forward_final, + dim3(arg.gridSize_), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k_, + arg.y_grid_desc_m_k_, + mean_var_count_grid_desc_m_k, + arg.scale_grid_desc_m_, + arg.bias_grid_desc_m_, + arg.mean_var_grid_desc_m_, + arg.blkGroupSize_, + arg.numBlockTileIteration_, + numMeanVarCountBlockTileIteration, + arg.epsilon_, + 
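+                                       // the partial mean / variance / count tiles written by the
+                                       // first-half kernel above are read back here and merged
+                                       // along the block-group dimension before the final
+                                       // normalization of y is applied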
static_cast(arg.workspace_mean_), + static_cast(arg.workspace_variance_), + static_cast(arg.workspace_count_), + arg.p_x_, + arg.p_scale_, + arg.p_bias_, + arg.y_elementwise_op_, + arg.p_y_, + arg.updateMovingAverage_, + arg.averageFactor_, + arg.resultRunningMean_, + arg.resultRunningVariance_, + arg.saveMeanInvVariance_, + arg.resultSaveMean_, + arg.resultSaveInvVariance_); + } + else + { + using GetReduceCountPerThreadFunctor = + GetReduceCountPerThreadForBlockwiseWelford; + + GetReduceCountPerThreadFunctor get_reduce_count_per_thread( + arg.numBlockTileIteration_, arg.reduce_length_); + + using GridwiseBatchNormForwardWithBlockwiseWelford_ = + GridwiseBatchNormForwardWithBlockwiseWelford; + + const auto kern_batchnorm_fwd = kernel_batchnorm_forward_with_blockwise_welford< + GridwiseBatchNormForwardWithBlockwiseWelford_, + XDataType, + YDataType, + AccDataType, + ScaleDataType, + BiasDataType, + MeanVarDataType, + YElementwiseOp, + XYGridDesc_M_K, + ScaleBiasMeanVarGridDesc_M, + ScaleBiasMeanVarGridDesc_M, + GetReduceCountPerThreadFunctor>; + + avg_time += launch_and_time_kernel(stream_config, + kern_batchnorm_fwd, + dim3(arg.gridSize_), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k_, + arg.y_grid_desc_m_k_, + arg.scale_grid_desc_m_, + arg.bias_grid_desc_m_, + arg.mean_var_grid_desc_m_, + get_reduce_count_per_thread, + arg.numBlockTileIteration_, + arg.epsilon_, + arg.p_x_, + arg.p_scale_, + arg.p_bias_, + arg.y_elementwise_op_, + arg.p_y_, + arg.updateMovingAverage_, // true or false + arg.averageFactor_, + arg.resultRunningMean_, + arg.resultRunningVariance_, + arg.saveMeanInvVariance_, // true or false + arg.resultSaveMean_, + arg.resultSaveInvVariance_); + }; + + return (avg_time); + }; + + float Run(const BaseArgument* pArg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(pArg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* pArg) override + { + const Argument* pArg_ = dynamic_cast(pArg); + + if constexpr(XSrcYDstVectorDim == 0) + { + if(pArg_->xStrides_[NumInvariantDim - 1] != 1 || + pArg_->yStrides_[NumInvariantDim - 1] != 1) + return false; + + if(pArg_->xyLengths_[NumInvariantDim - 1] % XSrcVectorSize != 0 || + pArg_->xyLengths_[NumInvariantDim - 1] % YDstVectorSize != 0) + return false; + } + else + { + if(pArg_->xStrides_[Rank - 1] != 1 || pArg_->yStrides_[Rank - 1] != 1) + return false; + + if(pArg_->xyLengths_[Rank - 1] % XSrcVectorSize != 0 || + pArg_->xyLengths_[Rank - 1] % YDstVectorSize != 0) + return false; + }; + + if(pArg_->bnScaleStrides_[NumInvariantDim - 1] != 1 && ScaleSrcVectorSize != 1) + return false; + if(pArg_->bnBiasStrides_[NumInvariantDim - 1] != 1 && BiasSrcVectorSize != 1) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % ScaleSrcVectorSize != 0) + return false; + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % BiasSrcVectorSize != 0) + return false; + + if(pArg_->bnMeanVarStrides_[NumInvariantDim - 1] != 1 && MeanVarSrcDstVectorSize != 1) + return false; + + if(pArg_->bnScaleBiasMeanVarLengths_[NumInvariantDim - 1] % MeanVarSrcDstVectorSize != 0) + return false; + + bool is_valid = true; + + static_for<0, NumInvariantDim, 1>{}([&](auto I) { + if(pArg_->xyLengths_[I] != pArg_->bnScaleBiasMeanVarLengths_[I]) + is_valid = false; + }); + + if(!is_valid) + return false; + + return true; + }; + + std::unique_ptr MakeArgumentPointer( + const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array 
reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_scale, + const void* p_bias, + double epsilon, + const YElementwiseOp y_elementwise_op, + void* p_y, + void* resultSaveMean, + void* resultSaveInvVariance, + double averageFactor, + void* resultRunningMean, + void* resultRunningVariance) override + { + return std::make_unique(xyLengths, + xStrides, + yStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(p_scale), + static_cast(p_bias), + y_elementwise_op, + epsilon, + static_cast(p_y), + static_cast(resultSaveMean), + static_cast(resultSaveInvVariance), + averageFactor, + static_cast(resultRunningMean), + static_cast(resultRunningVariance)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceBatchNormFwdImpl<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "XSrcYDstVectorDim_" << XSrcYDstVectorDim << ","; + str << "VectorSize_X" << XSrcVectorSize << "_scale_" << ScaleSrcVectorSize << "_bias_" << BiasSrcVectorSize << "_mean_var_" << MeanVarSrcDstVectorSize << "_Y" << YDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp new file mode 100644 index 00000000..29978458 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_cgemm_4gemm_xdl_cshuffle.hpp @@ -0,0 +1,948 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
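+
+// Complex GEMM computed with four real-valued GEMMs plus two elementwise kernels:
+//
+//   C_real = A_real * B_real - A_imag * B_imag
+//   C_imag = A_real * B_imag + A_imag * B_real
+//
+// The two real-valued partial products of each pair are staged in a caller-provided
+// workspace (p_aux_grid_ and p_aux_2_grid_ below), which is why GetWorkspaceSize()
+// reports 2 * sizeof(CDataType) * GetElementSpaceSize() of the padded C descriptor.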
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_cgemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template < + typename ALayout, + typename BLayout, + typename CLayout, + typename ADataType, + typename BDataType, + typename CDataType, + typename GemmAccDataType, + typename CShuffleDataType, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + GemmSpecialization GemmSpec, + index_t NumGemmKPrefetchStage, + index_t BlockSize, + index_t MPerBlock, + index_t NPerBlock, + index_t KPerBlock, + index_t AK1, + index_t BK1, + index_t MPerXDL, + index_t NPerXDL, + index_t MXdlPerWave, + index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_AK0_M_AK1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + index_t ABlockTransferSrcVectorDim, + index_t ABlockTransferSrcScalarPerVector, + index_t ABlockTransferDstScalarPerVector_AK1, + bool ABlockLdsExtraM, + typename BBlockTransferThreadClusterLengths_BK0_N_BK1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + index_t BBlockTransferSrcVectorDim, + index_t BBlockTransferSrcScalarPerVector, + index_t BBlockTransferDstScalarPerVector_BK1, + bool BBlockLdsExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + index_t CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopScheduler LoopSched = make_default_loop_scheduler(), + enable_if_t< + is_same_v && + is_same_v && + is_same_v, + bool> = false> +struct DeviceCGemm_4Gemm_Xdl_CShuffle + : public DeviceCGemm +{ + using DeviceOp = DeviceCGemm_4Gemm_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto MPerThread = Number<4>{}; + static constexpr auto AScalarPerVector = Number<4>{}; + static constexpr auto BScalarPerVector = Number<4>{}; + static constexpr auto CScalarPerVector = Number<4>{}; + + template + static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) + { + const auto M = desc_m.GetLength(I0); + const index_t loop_step = gridSize * blockSize * MPerThread; + const auto pad = math::integer_least_multiple(M, loop_step) - M; + const auto desc_m_pad = + transform_tensor_descriptor(desc_m, + make_tuple(make_right_pad_transform(M, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return desc_m_pad; + } + + static auto MakeDescriptor_M(const std::vector& lengths, + const std::vector& strides, + index_t gridSize, + index_t blockSize) + { + auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number<2>{}); + auto tupleOfStride = generate_tuple([&](auto 
I) { return strides[I]; }, Number<2>{}); + + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + const auto desc_m = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(tupleOfShape)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number<2>{})), + make_tuple(Sequence<0>{})); + + return PadDescriptor_M_1d(desc_m, gridSize, blockSize); + } + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + static auto 
MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // not pad N or K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const 
auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + using CGridDesc_M = decltype(MakeDescriptor_M({1, 1}, {1, 1}, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid_real, + const ADataType* p_a_grid_imag, + const BDataType* p_b_grid_real, + const BDataType* p_b_grid_imag, + CDataType* p_c_grid_real, + CDataType* p_c_grid_imag, + CDataType* p_workspace, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_real_{p_a_grid_real}, + p_a_grid_imag_{p_a_grid_imag}, + p_b_grid_real_{p_b_grid_real}, + p_b_grid_imag_{p_b_grid_imag}, + p_c_grid_real_{p_c_grid_real}, + p_c_grid_imag_{p_c_grid_imag}, 
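+              // p_workspace provides two C-sized scratch tensors: p_aux_grid_ starts at
+              // p_workspace and p_aux_2_grid_ starts GetElementSpaceSize() elements later
+              // (assigned at the end of the constructor body)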
+ p_aux_grid_{p_workspace}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + + const index_t grid_size = block_2_ctile_map_.CalculateGridSize(c_grid_desc_m_n_); + + if constexpr(is_same::value) + { + c_grid_desc_m_ = + DeviceOp::MakeDescriptor_M({MRaw, NRaw}, {StrideC, I1}, grid_size, BlockSize); + } + else if constexpr(is_same::value) + { + c_grid_desc_m_ = + DeviceOp::MakeDescriptor_M({MRaw, NRaw}, {I1, StrideC}, grid_size, BlockSize); + } + + p_aux_2_grid_ = p_workspace + c_grid_desc_m_n_.GetElementSpaceSize(); + } + + // private: + const ADataType* p_a_grid_real_; + const ADataType* p_a_grid_imag_; + const BDataType* p_b_grid_real_; + const BDataType* p_b_grid_imag_; + CDataType* p_c_grid_real_; + CDataType* p_c_grid_imag_; + CDataType* p_aux_grid_; + CDataType* p_aux_2_grid_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + CGridDesc_M c_grid_desc_m_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + using Add = ck::tensor_operation::element_wise::Add; + using Subtract = ck::tensor_operation::element_wise::Subtract; + + using GridwiseBinAdd = + GridwiseElementwise_1D, + Tuple, + Tuple, + Tuple, + Add, + MPerThread, + Sequence, + Sequence>; + + using GridwiseBinSubtract = + GridwiseElementwise_1D, + Tuple, + Tuple, + Tuple, + Subtract, + MPerThread, + Sequence, + Sequence>; + + const auto add_kernel = kernel_elementwise_1d, + Tuple, + Tuple, + Tuple, + Add>; + + const auto subtract_kernel = + kernel_elementwise_1d, + Tuple, + Tuple, + Tuple, + Subtract>; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + true>; + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_real_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_imag_, + arg.p_aux_2_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + // c_real = aux - aux_2 + ave_time += launch_and_time_kernel( + stream_config, + subtract_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_real_), + Subtract{}); + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_imag_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_real_, + arg.p_aux_2_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + // c_imag = aux + aux_2 + ave_time += launch_and_time_kernel( + stream_config, + add_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_imag_), + Add{}); + } + else + { + const auto kernel = 
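+                // same four-GEMM / subtract / add sequence as the branch above, instantiated
+                // with the main-K-block-loop flag set to false because
+                // CalculateHasMainKBlockLoop(K) returned false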
kernel_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + false>; + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_real_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_imag_, + arg.p_aux_2_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + // c_real = aux - aux_2 + ave_time += launch_and_time_kernel( + stream_config, + subtract_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_real_), + Subtract{}); + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_real_, + arg.p_b_grid_imag_, + arg.p_aux_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_imag_, + arg.p_b_grid_real_, + arg.p_aux_2_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + + // c_imag = aux + aux_2 + ave_time += launch_and_time_kernel( + stream_config, + add_kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + make_tuple(arg.c_grid_desc_m_, arg.c_grid_desc_m_), + make_tuple(arg.c_grid_desc_m_), + make_tuple(const_cast(arg.p_aux_grid_), + const_cast(arg.p_aux_2_grid_)), + make_tuple(arg.p_c_grid_imag_), + Add{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a_real, + const ADataType* p_a_imag, + const BDataType* p_b_real, + const BDataType* p_b_imag, + CDataType* p_c_real, + CDataType* p_c_imag, + CDataType* p_workspace, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t 
StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a_real, + p_a_imag, + p_b_real, + p_b_imag, + p_c_real, + p_c_imag, + p_workspace, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a_real, + const void* p_a_imag, + const void* p_b_real, + const void* p_b_imag, + void* p_c_real, + void* p_c_imag, + void* p_workspace, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + index_t /* KBatch */ = 1) override + { + return std::make_unique(static_cast(p_a_real), + static_cast(p_a_imag), + static_cast(p_b_real), + static_cast(p_b_imag), + static_cast(p_c_real), + static_cast(p_c_imag), + static_cast(p_workspace), + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceCGemm_4Gemm_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">"; + // clang-format on + + return str.str(); + } + + std::size_t GetWorkspaceSize(index_t MRaw, + index_t NRaw, + [[maybe_unused]] index_t KRaw, + [[maybe_unused]] index_t StrideA, + [[maybe_unused]] index_t StrideB, + index_t StrideC) override + { + const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC); + + return 2 * sizeof(CDataType) * c_grid_desc_m_n.GetElementSpaceSize(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..72c6d0b6 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,779 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
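+
+// The multi-index contraction is lowered to a single GEMM by merging dimensions: the M
+// dimensions of A/E collapse into MRaw = M0 * M1 * ..., the N dimensions of B/Ds/E into
+// NRaw = N0 * N1 * ..., and the K dimensions of A/B into KRaw = K0 * K1 * ....
+// For example, with NumDimM = NumDimN = NumDimK = 2 the contraction
+//
+//   A[M0, M1, K0, K1] * B[N0, N1, K0, K1] -> E[M0, M1, N0, N1]
+//
+// runs as a GEMM with M = M0 * M1, N = N0 * N1, K = K0 * K1, and the optional D tensors
+// are fused into the epilogue through cde_element_op.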
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_contraction_multiple_d_xdl_cshuffle( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatDsPointer p_ds_grid, + FloatE* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_etile_map; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[M0, M1, M2, ..., K0, K1, K2, ...] +// B[N0, N1, N2, ..., K0, K1, K2, ...] +// D[M0, M1, M2, ..., N0, N1, N2, ...] +// E[M0, M1, M2, ..., N0, N1, N2, ...] +template +struct DeviceContractionMultipleD_Xdl_CShuffle + : public DeviceContractionMultipleD +{ + using DeviceOp = DeviceContractionMultipleD_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + // Assume: A[M0, M1, M2, ..., K0, K1, K2, ...] 
+ static auto MakeAGridDescriptor_M_K(const std::vector& a_ms_ks_lengths_vec, + const std::vector& a_ms_ks_strides_vec) + { + assert(a_ms_ks_lengths_vec.size() == NumDimM + NumDimK && + a_ms_ks_strides_vec.size() == NumDimM + NumDimK); + + const auto to_tuple = [&](auto& vec, auto num) { + return generate_tuple([&](auto i) { return vec[i]; }, num); + }; + + const auto a_ms_ns_lengths = to_tuple(a_ms_ks_lengths_vec, Number{}); + const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(a_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(a_ms_ns_lengths, kDimIds); + + // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] + const auto a_grid_desc_ms_ks = + make_naive_tensor_descriptor(a_ms_ns_lengths, a_ms_ks_strides); + + // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] + const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( + a_grid_desc_ms_ks, + make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), + make_tuple(mDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + // Assume: B[N0, N1, N2, ..., K0, K1, K2, ...] + static auto MakeBGridDescriptor_N_K(const std::vector& b_ns_ks_lengths_vec, + const std::vector& b_ns_ks_strides_vec) + { + assert(b_ns_ks_lengths_vec.size() == NumDimN + NumDimK && + b_ns_ks_strides_vec.size() == NumDimN + NumDimK); + + const auto to_tuple = [&](auto& vec, auto num) { + return generate_tuple([&](auto i) { return vec[i]; }, num); + }; + + const auto b_ns_ks_lengths = to_tuple(b_ns_ks_lengths_vec, Number{}); + const auto b_ns_ks_strides = to_tuple(b_ns_ks_strides_vec, Number{}); + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen<0, NumDimN, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(b_ns_ks_lengths, kDimIds); + + // lengths for N0, N1, ... + const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); + + // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] + const auto b_grid_desc_ns_ks = + make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); + + // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] + const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( + b_grid_desc_ns_ks, + make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), + make_tuple(nDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + // assume E[M0, M1, M2, ..., N0, N1, N2...] 
+ static auto MakeEGridDescriptor_M_N(const std::vector& e_ms_ns_lengths_vec, + const std::vector& e_ms_ns_strides_vec) + { + assert(e_ms_ns_lengths_vec.size() == NumDimM + NumDimN && + e_ms_ns_strides_vec.size() == NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto num) { + return generate_tuple([&](auto i) { return vec[i]; }, num); + }; + + const auto e_ms_ns_lengths = to_tuple(e_ms_ns_lengths_vec, Number{}); + const auto e_ms_ns_strides = to_tuple(e_ms_ns_strides_vec, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto nLengths = get_container_subset(e_ms_ns_lengths, nDimIds); + + // naive tensor E[M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_ms_ns = + make_naive_tensor_descriptor(e_ms_ns_lengths, e_ms_ns_strides); + + // transformed tensor E[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...] + const auto e_grid_desc_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_M_N(ds_ms_ns_lengths_vec[i], + ds_ms_ns_strides_vec[i]); + }, + Number{}); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct 
Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_ms_ns_lengths, a_ms_ks_strides)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_ns_ks_lengths, b_ns_ks_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_ms_ns_lengths, e_ms_ns_strides)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_mz_stride_{}, + a_kz_stride_{}, + b_nz_stride_{}, + b_kz_stride_{}, + ds_nz_stride_{}, + e_nz_stride_{} + { + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = + DeviceOp::MakeEGridDescriptor_M_N(ds_ms_ns_lengths[i], ds_ms_ns_strides[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + } + + // for sanity check of vector memory access + a_mz_stride_ = a_ms_ks_strides[NumDimM - 1]; + a_kz_stride_ = a_ms_ks_strides[NumDimM + NumDimK - 1]; + + b_nz_stride_ = b_ns_ks_strides[NumDimN - 1]; + b_kz_stride_ = b_ns_ks_strides[NumDimN + NumDimK - 1]; + + for(index_t i = 0; i < NumDTensor; ++i) + { + ds_nz_stride_[i] = ds_ms_ns_strides[i][NumDimM + NumDimN - 1]; + } + + e_nz_stride_ = e_ms_ns_strides[NumDimM + NumDimN - 1]; + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + 
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // Strides for the last M/N/K dimensions of A/B/Ds/E + // for sanity check of vector load/store + index_t a_mz_stride_; + index_t a_kz_stride_; + index_t b_nz_stride_; + index_t b_kz_stride_; + std::array ds_nz_stride_; + index_t e_mz_stride_; + index_t e_nz_stride_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_contraction_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::Block2ETileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + return false; + } + + // check vector access + static_assert((ABlockTransferSrcVectorDim == 1 || ABlockTransferSrcVectorDim == 2) && + (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), + "wrong!"); + + // vector memory access of A: could be on M or AK1 dimension + if constexpr(ABlockTransferSrcVectorDim == 1) + { + if(!(arg.a_mz_stride_ == 1 && + arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) 
% ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(arg.a_kz_stride_ == 1 && + arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of B: could be on N or BK1 dimension + if constexpr(BBlockTransferSrcVectorDim == 1) + { + if(!(arg.b_nz_stride_ == 1 && + arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(arg.b_kz_stride_ == 1 && + arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of Ds: always on NPerBlock dimension + bool valid_d_access = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + if(!(arg.ds_nz_stride_[i] == 1 && + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_[i].GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0)) + { + valid_d_access = false; + } + }); + + if(valid_d_access == false) + { + return false; + } + + // vector memory access of E: always on NPerBlock dimension + if(!(arg.e_nz_stride_ == 1 && + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0)) + { + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_ms_ns_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + const std::vector& a_ms_ns_lengths, + const std::vector& a_ms_ks_strides, + const std::vector& b_ns_ks_lengths, + const std::vector& b_ns_ks_strides, + const std::array, NumDTensor>& ds_ms_ns_lengths, + const std::array, NumDTensor>& ds_ms_ns_strides, + const std::vector& e_ms_ns_lengths, + const std::vector& e_ms_ns_strides, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_ms_ns_lengths, + a_ms_ks_strides, + b_ns_ks_lengths, + b_ns_ks_strides, + ds_ms_ns_lengths, + ds_ms_ns_strides, + e_ms_ns_lengths, + e_ms_ns_strides, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << 
MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..4760422b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_backward_weight_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,787 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_weight.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public DeviceConvBwdWeight<2, + ck::tensor_layout::convolution::NHWC, + ck::tensor_layout::convolution::KYXC, + ck::tensor_layout::convolution::NHWK, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + static constexpr ck::index_t NDimSpatial = 2; + + using DeviceOp = + DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K; + + using ADataType = OutDataType; + using BDataType = InDataType; + using CDataType = WeiDataType; + + using AElementwiseOperation = OutElementwiseOperation; + using BElementwiseOperation = InElementwiseOperation; + using CElementwiseOperation = WeiElementwiseOperation; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + static constexpr auto N1Number = K1Number; + + // Bytes per 32 lds bank: 32 * 4 bytes + static constexpr auto BankLength = 128; + static constexpr auto ElePerBank = BankLength / sizeof(ADataType); + + // M1 & M0 + static constexpr auto ABlockLdsM1PerBlock = ElePerBank / K1; + static constexpr auto ABlockLdsM0PerBlock = MPerBlock / ABlockLdsM1PerBlock; + static constexpr auto ABlockLdsM1Padding = 4; + + // N1 & N0 + static constexpr auto BBlockLdsN1PerBlock = ElePerBank / K1; + static constexpr auto BBlockLdsN0PerBlock = NPerBlock / BBlockLdsN1PerBlock; + static constexpr auto BBlockLdsN1Padding = 4; + + static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + 
std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + ck::index_t batch_k) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t GemmKTotal = N * Ho * Wo; + const index_t GemmM = K; + const index_t GemmN = C * X * Y; + + const index_t GemmKBatch = batch_k; + const index_t GemmK0 = + math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) * + K0PerBlock; + + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + // A: output tensor + const index_t N0 = N / N1Number; + const index_t GemmK0Total = N0 * Ho * Wo; + + const index_t GemmK0S = + math::integer_divide_ceil(GemmK0Total, K0PerBlock * GemmKBatch) * K0PerBlock; + const index_t GemmK0Pad = GemmKBatch * GemmK0S; + const auto out_n_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Ho * Wo, K)); + + const auto out_n0_ho_wo_k_n1_grid_desc = + transform_tensor_descriptor(out_n_ho_wo_k_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1Number)), + make_pass_through_transform(Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{})); + + const auto out_gemmk0total_gemmm_gemmk1_grid_desc = + transform_tensor_descriptor(out_n0_ho_wo_k_n1_grid_desc, + make_tuple(make_merge_transform(make_tuple(N0, Ho * Wo)), + make_pass_through_transform(K), + make_pass_through_transform(N1Number)), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto out_gemmk0pad_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmk0total_gemmm_gemmk1_grid_desc, + make_tuple(make_right_pad_transform(GemmK0Total, GemmK0Pad - GemmK0Total), + make_pass_through_transform(GemmM), + make_pass_through_transform(N1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmk0pad_gemmm_gemmk1_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0)), + make_pass_through_transform(GemmM), + make_pass_through_transform(N1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{})); + + // B: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + 
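+        // Implicit-GEMM view used for backward weight: the output gradient is the A matrix and
+        // the input is the B matrix, with
+        //   GemmM = K,  GemmN = Y * X * C,  GemmKTotal = N * Ho * Wo.
+        // GemmKTotal is split into GemmKBatch * GemmK0 * GemmK1, where GemmK1 comes from
+        // unmerging N into N0 * N1 (N1 == K1), so split-K parallelism maps onto GemmKBatch.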
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n0_y_ho_x_wo_c_n1_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(N0, N1Number)), + make_pass_through_transform(Y), + make_pass_through_transform(Ho), + make_pass_through_transform(X), + make_pass_through_transform(Wo), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 6>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{})); + + const auto in_gemmk0total_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_n0_y_ho_x_wo_c_n1_grid_desc, + make_tuple(make_merge_transform(make_tuple(N0, Ho, Wo)), + make_merge_transform(make_tuple(Y, X, C)), + make_pass_through_transform(N1Number)), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmk0pad_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0total_gemmn_gemmk1_grid_desc, + make_tuple(make_right_pad_transform(GemmK0Total, GemmK0Pad - GemmK0Total), + make_pass_through_transform(GemmN), + make_pass_through_transform(N1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0pad_gemmn_gemmk1_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0)), + make_pass_through_transform(GemmN), + make_pass_through_transform(N1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + + using ABCGridDescs = decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, 1)); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXdl, + NPerXdl, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + 
ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + ABlockLdsM1PerBlock, + ABlockLdsM0PerBlock, + ABlockLdsM1Padding, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + BBlockLdsN1PerBlock, + BBlockLdsN0PerBlock, + BBlockLdsN1Padding, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CBlockTransferScalarPerVector_NWaveNPerXdl, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + true, + true>; + + using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::AtomicAdd, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXdl, + NPerXdl, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + ABlockLdsM1PerBlock, + ABlockLdsM0PerBlock, + ABlockLdsM1Padding, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + BBlockLdsN1PerBlock, + BBlockLdsN0PerBlock, + BBlockLdsN1Padding, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CBlockTransferScalarPerVector_NWaveNPerXdl, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + true, + true>; + // Argument + using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = + decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{})); + + using Block2CTileMap = + decltype(GridwiseGemm::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1)); + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_grid, + WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + ck::index_t M01, + ck::index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) + : p_a_grid_{p_out_grid}, + p_b_grid_{p_in_grid}, + p_c_grid_{p_wei_grid}, + a_grid_desc_kbatch_k0_m_k1_{}, + b_grid_desc_kbatch_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{out_element_op}, + b_element_op_{in_element_op}, + c_element_op_{wei_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + output_spatial_lengths_{output_spatial_lengths}, + 
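+        // Two gridwise GEMM instantiations are kept on purpose: the InMemoryDataOperationEnum::Set
+        // variant is used when k_batch == 1, while the AtomicAdd variant lets split-K work-groups
+        // accumulate partial weight gradients directly in global memory (the C buffer is
+        // zero-initialized with hipMemset before launch, see Invoker::Run).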
filter_spatial_lengths_{filter_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads}, + k_batch_{split_k} + { + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + k_batch_); + + a_grid_desc_kbatch_k0_m_k1_ = descs[I0]; + b_grid_desc_kbatch_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + + block_2_ctile_map_ = + GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_); + + if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_, + b_grid_desc_kbatch_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_); + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_; + Block2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + InElementwiseOperation a_element_op_; + OutElementwiseOperation b_element_op_; + WeiElementwiseOperation c_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + std::array output_spatial_lengths_; + std::array filter_spatial_lengths_; + std::array conv_filter_strides_; + std::array input_left_pads_; + std::array input_right_pads_; + index_t k_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + void ShowInfo(const Argument& arg) + { + std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{" + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{" + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + ShowInfo(arg); + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight has invalid setting"); + } + const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0); + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1); + + const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0); + + float ave_time = 0; + + const auto Run = [&](const auto& kernel) { + hipGetErrorString(hipMemset( + arg.p_c_grid_, + 0, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(CDataType))); + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + }; + + if(has_main_k0_block_loop) + { + if(kbatch == 1) + { + const auto kernel = kernel_gemm_xdlops_bwd_weight< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + OutElementwiseOperation, + InElementwiseOperation, + WeiElementwiseOperation, + remove_reference_t, + true>; + + Run(kernel); + } + else + { + const auto kernel = kernel_gemm_xdlops_bwd_weight< + GridwiseGemmAtomicAdd, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + OutElementwiseOperation, + InElementwiseOperation, + WeiElementwiseOperation, + remove_reference_t, + true>; + + Run(kernel); + } + } + else + { + if(kbatch == 1) + { + const auto kernel = kernel_gemm_xdlops_bwd_weight< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + OutElementwiseOperation, + InElementwiseOperation, + WeiElementwiseOperation, + remove_reference_t, + false>; + + Run(kernel); + } + else + { + const auto kernel = kernel_gemm_xdlops_bwd_weight< + GridwiseGemmAtomicAdd, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + OutElementwiseOperation, + InElementwiseOperation, + WeiElementwiseOperation, + remove_reference_t, + false>; + + Run(kernel); + } + } + + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 && + arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // unmerge N to N0 and N1, where N1 equals to K1 + if(!(arg.Conv_N_ % K1 == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_C_ % CBlockTransferScalarPerVector_NWaveNPerXdl == 0)) + { + return false; + } + + // Gridwise GEMM size + return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + bool IsSupportedArgument(const 
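+    // The argument checks above assume NHWC/KYXC layouts: the fastest-varying dimensions
+    // (K for the output gradient, C for the input and the weight) must be divisible by the
+    // configured vector widths, and N must be divisible by K1 so that it can be unmerged
+    // into N0 * N1.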
BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in_grid, + WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op, + split_k}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_grid, + void* p_wei_grid, + const void* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op, + split_k); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv2dBwdWeightXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..ca79b932 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,835 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
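+// Illustrative host-side usage of the backward-data device op defined below (a sketch only,
+// not part of the original header; DeviceOpInstance stands for a concrete instantiation of
+// DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K, and the pointers are
+// assumed to reference device buffers prepared by the caller):
+//
+//   auto op       = DeviceOpInstance{};
+//   auto argument = op.MakeArgument(p_in_grad, p_wei, p_out_grad,
+//                                   N, K, C,
+//                                   input_spatial_lengths, filter_spatial_lengths,
+//                                   output_spatial_lengths, conv_filter_strides,
+//                                   conv_filter_dilations, input_left_pads, input_right_pads,
+//                                   in_element_op, wei_element_op, out_element_op);
+//   if(op.IsSupportedArgument(argument))
+//   {
+//       float time_ms = op.MakeInvoker().Run(argument, StreamConfig{});
+//   }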
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public DeviceConvBwdData<2, + ck::tensor_layout::convolution::NHWC, + ck::tensor_layout::convolution::KYXC, + ck::tensor_layout::convolution::NHWK, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + using DeviceOp = DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K; + + using ADataType = OutDataType; + using BDataType = WeiDataType; + using CDataType = InDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr index_t NDimSpatial = 2; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static_assert((K1 % ABlockTransferThreadClusterLengths_K0_M_K1{}[I2]) % + ABlockTransferSrcScalarPerVector == + 0); + static_assert((NPerBlock / BBlockTransferThreadClusterLengths_K0_N_K1{}[I1]) % + BBlockTransferSrcScalarPerVector == + 0); + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + index_t i_ytilde, + index_t i_xtilde) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const auto K0 = K / K1; + + const auto out_n_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K)); + const auto wei_k_y_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C)); + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + 
ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + 
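+            // General (non-1x1) backward data: the stride/dilation pair is factored through
+            //   YTilde = ConvStrideH / gcd(ConvStrideH, ConvDilationH)
+            //   XTilde = ConvStrideW / gcd(ConvStrideW, ConvDilationW)
+            // and the input gradient is assembled from YTilde * XTilde independent GEMMs, one per
+            // (i_ytilde, i_xtilde) slice of the filter; empty slices (YDotSlice * XDotSlice == 0)
+            // are skipped when the Argument is constructed.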
make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_hip_wip_c_grid_desc = 
transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_htildeslice_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + + using ABCGridDescs = decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, 0, 0)); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3< + BlockSize, + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXdl, + NPerXdl, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder, + 7, // CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector>; + + // Argument + struct 
Argument : public BaseArgument + { + Argument(InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + ck::index_t M01, + ck::index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_out_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_in_grid}, + M01_{M01}, + N01_{N01}, + a_element_op_{out_element_op}, + b_element_op_{wei_element_op}, + c_element_op_{in_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde) + { + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const index_t Y = filter_spatial_lengths_[0]; + const index_t X = filter_spatial_lengths_[1]; + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(YDotSlice * XDotSlice <= 0) + { + continue; + } + + const auto descs = DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + i_ytilde, + i_xtilde); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + auto block_2_ctile_map = + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01, N01); + + if(GridwiseGemm::CheckValidity( + descs[I0], descs[I1], descs[I2], block_2_ctile_map)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2])); + + block_2_ctile_map_container_.push_back(block_2_ctile_map); + } + } + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + std::vector a_grid_desc_k0_m_k1_container_; + std::vector b_grid_desc_k0_n_k1_container_; + std::vector c_grid_desc_m_n_container_; + std::vector + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_; + std::vector block_2_ctile_map_container_; + index_t M01_; + index_t N01_; + OutElementwiseOperation a_element_op_; + WeiElementwiseOperation b_element_op_; + InElementwiseOperation c_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + + std::vector 
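+        // The *_container_ members above hold one entry per valid (i_ytilde, i_xtilde) GEMM:
+        // its A/B/C descriptors, the M0..N2 output view and the block-to-C-tile map. Invoker::Run
+        // launches the corresponding kernels back to back and sums their measured times.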
input_spatial_lengths_; + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float ave_time = 0; + for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_container_{" + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", " + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2) << "}" + << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_container_{" + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I0) << ", " + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I2) << "}" + << std::endl; + + std::cout << "arg.c_grid_desc_m_n_container_{ " + << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", " + << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}" + << std::endl; + + std::cout << "arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I0) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I1) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I2) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I3) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I4) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I5) + << " ) " << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i], + arg.block_2_ctile_map_container_[i])) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting"); + } + + const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize( + arg.c_grid_desc_m_n_container_[i]); + + const auto K = arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) * + arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2); + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>, + OutElementwiseOperation, + WeiElementwiseOperation, + InElementwiseOperation, + remove_reference_t, + true>; + + ave_time += launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i], + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_container_[i]); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>, + OutElementwiseOperation, + WeiElementwiseOperation, + InElementwiseOperation, + remove_reference_t, + false>; + + ave_time += launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i], + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_container_[i]); + } + } + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 pad = 0 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 1 && + arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_C_ % CThreadTransferDstScalarPerVector == 0)) + { + return false; + } + + // Gridwise GEMM size + for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i], + arg.block_2_ctile_map_container_[i])) + { + return false; + } + } + return true; + } + + bool IsSupportedArgument(const 
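+    // Support is rejected early when the Filter1x1Stride1Pad0 specialization is selected but the
+    // problem is not actually a 1x1, stride-1, pad-0 convolution; in all cases K and C must be
+    // divisible by the configured vector widths for the global loads and stores.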
BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(void* p_in_grid, + const void* p_wei_grid, + const void* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..4749665c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,968 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
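+// The device op below fuses the convolution GEMM with the bias add, the activation and the
+// residual add in a single kernel. The implicit-GEMM mapping used by the grid descriptors is
+//   GemmM = N * Ho * Wo (right-padded to a multiple of MPerBlock),
+//   GemmN = K,
+//   GemmK = Y * X * C,
+// with the bias treated as a length-K vector broadcast along GemmM and the residual sharing the
+// output tensor layout.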
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = +// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K] +template < + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename AccDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + ConvolutionForwardSpecialization ConvForwardSpecialization, + ck::index_t BlockSize, + ck::index_t MPerBlock, + ck::index_t NPerBlock, + ck::index_t K0PerBlock, + ck::index_t K1, + ck::index_t MPerXDL, + ck::index_t NPerXDL, + ck::index_t MXdlPerWave, + ck::index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + ck::index_t ABlockTransferSrcVectorDim, + ck::index_t ABlockTransferSrcScalarPerVector, + ck::index_t ABlockTransferDstScalarPerVector_K1, + bool ABlockLdsAddExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + ck::index_t BBlockTransferSrcVectorDim, + ck::index_t BBlockTransferSrcScalarPerVector, + ck::index_t BBlockTransferDstScalarPerVector_K1, + bool BBlockLdsAddExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + index_t CBlockTransferScalarPerVector_NWaveNPerXdl> +struct + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public DeviceConvFwdBiasActivationAdd +{ + using DeviceOp = + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K; + + using ADataType = InDataType; + using BDataType = WeiDataType; + using CDataType = OutDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + // TODO make it support any # of spatial dimensions + static constexpr index_t NDimSpatial = 2; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = 
output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t GemmMRaw = N * Ho * Wo; + const index_t GemmN = K; + + const auto GemmM = math::integer_least_multiple(GemmMRaw, MPerBlock); + const auto GemmMPad = GemmM - GemmMRaw; + + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { // 1x1, stride=1, pad=0 + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_gemmmraw_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmmraw_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + // C1: residual tensor: assume same layout as output tensor + const auto resi_grid_desc_gemmm_gemmn = out_gemmm_gemmn_grid_desc; + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn, + resi_grid_desc_gemmm_gemmn); + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { // 1x1, pad=0 + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + 
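+            // The bias descriptor built in each branch uses strides (0, 1): element (m, n) maps to
+            // bias[n], so the length-GemmN (= K) bias vector is broadcast across every GemmM row
+            // without materializing a 2-D tensor.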
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_n_ho_wo_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + // C1: residual tensor: assume same layout as output tensor + const auto resi_grid_desc_gemmm_gemmn = out_gemmm_gemmn_grid_desc; + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn, + resi_grid_desc_gemmm_gemmn); + } + else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::OddC) + { // C = odd value + const index_t GemmKRaw = Y * X * C; + const index_t GemmK = math::integer_least_multiple(GemmKRaw, K0PerBlock * GemmK1Number); + const index_t GemmKPad = GemmK - GemmKRaw; + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto 
in_gemmkraw_gemmmraw_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + in_gemmkraw_gemmmraw_grid_desc, + make_tuple(make_right_pad_transform(GemmKRaw, GemmKPad), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), + make_right_pad_transform(GemmKRaw, GemmKPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + // C1: residual tensor: assume same layout as output tensor + const auto resi_grid_desc_gemmm_gemmn = out_gemmm_gemmn_grid_desc; + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn, + resi_grid_desc_gemmm_gemmn); + } + else + { + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = 
transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmmraw_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmmraw_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmMRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + // C1: residual tensor: assume same layout as output tensor + const auto resi_grid_desc_gemmm_gemmn = out_gemmm_gemmn_grid_desc; + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn, + resi_grid_desc_gemmm_gemmn); + } + } + + using GridDescs = decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1})); + + using AGridDesc_K0_M_K1 = 
remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + using C0GridDesc_M_N = remove_cvref_t; + using C1GridDesc_M_N = remove_cvref_t; + + using Block2CTileMap = BlockToCTileMap_M00_N0_M01; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3< + BlockSize, + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + C0GridDesc_M_N, + C1GridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, // ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder, + 2, // BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + CBlockTransferScalarPerVector_NWaveNPerXdl>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + const OutDataType* p_bias_grid, + const OutDataType* p_resi_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_in_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_out_grid}, + p_c0_grid_{p_bias_grid}, + p_c1_grid_{p_resi_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c0_grid_desc_m_n_{}, + c1_grid_desc_m_n_{}, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{}, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{}, + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{}, + block_2_ctile_map_{}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + 
a_grid_desc_k0_m_k1_ = descs[I0]; + b_grid_desc_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + c0_grid_desc_m_n_ = descs[I3]; + c1_grid_desc_m_n_ = descs[I4]; + + block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_}; + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ = + GridwiseGemm:: + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + c_grid_desc_m_n_); + + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ = + GridwiseGemm:: + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + c0_grid_desc_m_n_); + + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ = + GridwiseGemm:: + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + c1_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + const CDataType* p_c0_grid_; + const CDataType* p_c1_grid_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + C0GridDesc_M_N c0_grid_desc_m_n_; + C1GridDesc_M_N c1_grid_desc_m_n_; + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_; + typename GridwiseGemm:: + C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_; + typename GridwiseGemm:: + C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_; + Block2CTileMap block_2_ctile_map_; + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + std::vector input_spatial_lengths_; + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << DeviceOp{}.GetTypeString() << std::endl; + std::cout << "N " << arg.Conv_N_ << ", " + << "K " << arg.Conv_K_ << ", " + << "C " << arg.Conv_C_ << ", " << std::endl; + std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", " + << arg.filter_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", " + << arg.input_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", " + << arg.output_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Strides " << arg.conv_filter_strides_[0] << ", " + << arg.conv_filter_strides_[1] << ", " << std::endl; + std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", " + << arg.conv_filter_dilations_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", " + << arg.input_left_pads_[1] << ", " << std::endl; + std::cout << 
"InLeftPads " << arg.input_right_pads_[0] << ", " + << arg.input_right_pads_[1] << ", " << std::endl; + } + + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + std::cout << "arg.c0_grid_desc_m_n_{ " << arg.c0_grid_desc_m_n_.GetLength(I0) + << ", " << arg.c0_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + std::cout << "arg.c1_grid_desc_m_n_{ " << arg.c1_grid_desc_m_n_.GetLength(I0) + << ", " << arg.c1_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r3 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v3r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + remove_reference_t< + typename GridwiseGemm:: + C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + remove_reference_t< + typename GridwiseGemm:: + C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + Block2CTileMap, + true>; + + ave_time = launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_c0_grid_, + arg.p_c1_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_v3r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + remove_reference_t< + typename GridwiseGemm:: + C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + remove_reference_t< + typename GridwiseGemm:: + C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + Block2CTileMap, + false>; + + ave_time = launch_and_time_kernel( + stream_config, + kernel, + 
dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_c0_grid_, + arg.p_c1_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 && + arg.Conv_C_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_K_ % CBlockTransferScalarPerVector_NWaveNPerXdl == 0)) + { + return false; + } + + // Gridwise GEMM size + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + const OutDataType* p_bias_grid, + const OutDataType* p_resi_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + p_bias_grid, + p_resi_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_grid, + const void* p_wei_grid, + void* 
p_out_grid, + const void* p_bias_grid, + const void* p_resi_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + static_cast(p_bias_grid), + static_cast(p_resi_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..bafbfe4d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,925 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
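The fused convolution operators in this patch all share the same host-side calling convention: build a type-erased argument with MakeArgumentPointer, gate the launch with IsSupportedArgument, then run through an invoker obtained from MakeInvokerPointer. Below is a minimal sketch of that sequence for the bias + activation + add operator defined above. It is illustrative only: conv_op is assumed to be an already-constructed instance with some concrete choice of data types, element-wise operators and tiling parameters, and the device buffers, problem sizes and element-wise operator types (placeholder names) are assumed to be set up elsewhere.

    // Illustrative sketch only; DeviceOpInstance, InElementOp/WeiElementOp/OutElementOp and the
    // problem sizes are placeholders, not definitions from this patch.
    float run_fused_conv(DeviceOpInstance& conv_op,
                         const void* p_in, const void* p_wei, void* p_out,
                         const void* p_bias, const void* p_resi)
    {
        const ck::index_t N = 4, K = 256, C = 128;

        auto argument = conv_op.MakeArgumentPointer(
            p_in, p_wei, p_out, p_bias, p_resi,
            N, K, C,
            {28, 28},          // input Hi, Wi
            {3, 3},            // filter Y, X
            {28, 28},          // output Ho, Wo (3x3 filter, stride 1, pad 1)
            {1, 1}, {1, 1},    // strides, dilations
            {1, 1}, {1, 1},    // left pads, right pads
            InElementOp{}, WeiElementOp{}, OutElementOp{});

        // Each instance only supports a subset of problem sizes (vector-load divisibility of C,
        // the 1x1 specializations, ...), so the launch is gated on IsSupportedArgument.
        if(!conv_op.IsSupportedArgument(argument.get()))
            return -1.f;

        auto invoker = conv_op.MakeInvokerPointer();
        // Assumes the library's StreamConfig{stream, time_kernel} layout; returns the measured kernel time.
        return invoker->Run(argument.get(), StreamConfig{nullptr, true});
    }

Note that the argument is created before the support check on purpose: IsSupportedArgument inspects the grid descriptors built inside the argument (through GridwiseGemm::CheckValidity) as well as the raw convolution sizes.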
+ +#pragma once + +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = +// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) +template < + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename AccDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + InMemoryDataOperationEnum OutGlobalMemoryDataOperation, + ConvolutionForwardSpecialization ConvForwardSpecialization, + ck::index_t BlockSize, + ck::index_t MPerBlock, + ck::index_t NPerBlock, + ck::index_t K0PerBlock, + ck::index_t K1, + ck::index_t MPerXDL, + ck::index_t NPerXDL, + ck::index_t MXdlPerWave, + ck::index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + ck::index_t ABlockTransferSrcVectorDim, + ck::index_t ABlockTransferSrcScalarPerVector, + ck::index_t ABlockTransferDstScalarPerVector_K1, + bool ABlockLdsAddExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + ck::index_t BBlockTransferSrcVectorDim, + ck::index_t BBlockTransferSrcScalarPerVector, + ck::index_t BBlockTransferDstScalarPerVector_K1, + bool BBlockLdsAddExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + index_t CBlockTransferScalarPerVector_NWaveNPerXdl> +struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public DeviceConvFwdBiasActivation +{ + using DeviceOp = + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K; + + using ADataType = InDataType; + using BDataType = WeiDataType; + using CDataType = OutDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + // TODO make it support any # of spatial dimensions + static constexpr index_t NDimSpatial = 2; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + 
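+        // Implicit-GEMM view used by the descriptor construction below:
+        //   GemmM = N * Ho * Wo   (one GEMM row per output pixel, padded up to a multiple of MPerBlock)
+        //   GemmN = K             (one GEMM column per output channel)
+        //   GemmK = Y * X * C     (reduction over the filter window and input channels),
+        //                         split as GemmK = GemmK0 * GemmK1Number for the blockwise GEMM.
+        // Illustrative numbers only (not taken from this patch): N = 4, Ho = Wo = 28, K = 256,
+        // Y = X = 3, C = 128 gives GemmM = 4 * 28 * 28 = 3136, GemmN = 256 and
+        // GemmK = 3 * 3 * 128 = 1152.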
+ const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t GemmMRaw = N * Ho * Wo; + const index_t GemmN = K; + + const auto GemmM = math::integer_least_multiple(GemmMRaw, MPerBlock); + const auto GemmMPad = GemmM - GemmMRaw; + + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { // 1x1, stride=1, pad=0 + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_gemmmraw_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmmraw_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn); + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { // 1x1, pad=0 + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_n_ho_wo_c_grid_desc, + 
make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn); + } + else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::OddC) + { // C = odd value + const index_t GemmKRaw = Y * X * C; + const index_t GemmK = math::integer_least_multiple(GemmKRaw, K0PerBlock * GemmK1Number); + const index_t GemmKPad = GemmK - GemmKRaw; + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmkraw_gemmmraw_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk_gemmm_grid_desc = transform_tensor_descriptor( 
+ in_gemmkraw_gemmmraw_grid_desc, + make_tuple(make_right_pad_transform(GemmKRaw, GemmKPad), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), + make_right_pad_transform(GemmKRaw, GemmKPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn); + } + else + { + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmmraw_grid_desc = + 
transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmmraw_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmMRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // C0: bias tensor: assume a contiguous vector + const auto bias_grid_desc_gemmm_gemmn = + make_naive_tensor_descriptor(make_tuple(GemmM, GemmN), make_tuple(I0, I1)); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc, + bias_grid_desc_gemmm_gemmn); + } + } + + using ABCGridDescs = decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1})); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + using C0GridDesc_M_N = remove_cvref_t; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2< + BlockSize, + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + OutGlobalMemoryDataOperation, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + C0GridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + Sequence<1, 0, 2>, // 
ABlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, // ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder, + 2, // BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + CBlockTransferScalarPerVector_NWaveNPerXdl>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + const OutDataType* p_bias_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + ck::index_t M01, + ck::index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_in_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_out_grid}, + p_c0_grid_{p_bias_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c0_grid_desc_m_n_{}, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{}, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + a_grid_desc_k0_m_k1_ = descs[I0]; + b_grid_desc_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + c0_grid_desc_m_n_ = descs[I3]; + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01); + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ = + GridwiseGemm:: + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + c_grid_desc_m_n_); + + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ = + GridwiseGemm:: + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + c0_grid_desc_m_n_); + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + const CDataType* p_c0_grid_; + 
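+        // The members below cache both the plain M x N grid descriptors and the blocked
+        // (MBlock, MXdlPerWave, MWave, MPerXdl, ...) descriptors computed in the constructor,
+        // so Invoker::Run can launch the kernel without rebuilding them.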
AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + C0GridDesc_M_N c0_grid_desc_m_n_; + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_; + typename GridwiseGemm:: + C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + std::vector input_spatial_lengths_; + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << DeviceOp{}.GetTypeString() << std::endl; + std::cout << "N " << arg.Conv_N_ << ", " + << "K " << arg.Conv_K_ << ", " + << "C " << arg.Conv_C_ << ", " << std::endl; + std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", " + << arg.filter_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", " + << arg.input_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", " + << arg.output_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Strides " << arg.conv_filter_strides_[0] << ", " + << arg.conv_filter_strides_[1] << ", " << std::endl; + std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", " + << arg.conv_filter_dilations_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", " + << arg.input_left_pads_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_right_pads_[0] << ", " + << arg.input_right_pads_[1] << ", " << std::endl; + } + + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + std::cout << "arg.c0_grid_desc_m_n_{ " << arg.c0_grid_desc_m_n_.GetLength(I0) + << ", " << arg.c0_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r2 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v3r2< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + remove_reference_t< + typename GridwiseGemm:: + C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + remove_reference_t, + true>; + + ave_time = launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_c0_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_v3r2< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + remove_reference_t< + typename GridwiseGemm:: + C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_c0_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.input_left_pads_[0] == 0 && 
arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 && + arg.Conv_C_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_K_ % CBlockTransferScalarPerVector_NWaveNPerXdl == 0)) + { + return false; + } + + // Gridwise GEMM size + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + const OutDataType* p_bias_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + p_bias_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_grid, + const void* p_wei_grid, + void* p_out_grid, + const void* p_bias_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + static_cast(p_bias_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..6a6d24bf --- /dev/null +++ 
b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,893 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template < + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename AccDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation, + ConvolutionForwardSpecialization ConvForwardSpecialization, + ck::index_t BlockSize, + ck::index_t MPerBlock, + ck::index_t NPerBlock, + ck::index_t K0PerBlock, + ck::index_t K1, + ck::index_t MPerXdl, + ck::index_t NPerXdl, + ck::index_t MXdlPerWave, + ck::index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + ck::index_t ABlockTransferSrcVectorDim, + ck::index_t ABlockTransferSrcScalarPerVector, + ck::index_t ABlockTransferDstScalarPerVector_K1, + bool ABlockLdsAddExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + ck::index_t BBlockTransferSrcVectorDim, + ck::index_t BBlockTransferSrcScalarPerVector, + ck::index_t BBlockTransferDstScalarPerVector_K1, + bool BBlockLdsAddExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + index_t CBlockTransferScalarPerVector_NWaveNPerXdl> +struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public DeviceConvFwd<2, + ck::tensor_layout::convolution::NHWC, + ck::tensor_layout::convolution::KYXC, + ck::tensor_layout::convolution::NHWK, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + using DeviceOp = DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K; + + using ADataType = InDataType; + using BDataType = WeiDataType; + using CDataType = OutDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr index_t NDimSpatial = 2; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + 
std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t GemmMRaw = N * Ho * Wo; + const index_t GemmN = K; + + const auto GemmM = math::integer_least_multiple(GemmMRaw, MPerBlock); + const auto GemmMPad = GemmM - GemmMRaw; + + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { // 1x1, stride=1, pad=0 + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_gemmmraw_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmmraw_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { // 1x1, pad=0 + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, 
Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_n_ho_wo_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::OddC) + { // C = odd value + const index_t GemmKRaw = Y * X * C; + const index_t GemmK = math::integer_least_multiple(GemmKRaw, K0PerBlock * GemmK1Number); + const index_t GemmKPad = GemmK - GemmKRaw; + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmkraw_gemmmraw_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk_gemmm_grid_desc = transform_tensor_descriptor( + in_gemmkraw_gemmmraw_grid_desc, + 
make_tuple(make_right_pad_transform(GemmKRaw, GemmKPad), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), + make_right_pad_transform(GemmKRaw, GemmKPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + else + { + const index_t GemmK = Y * X * C; + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmmraw_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto 
in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmmraw_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmMRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + } + + using ABCGridDescs = decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1})); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + using Block2CTileMap = BlockToCTileMap_M00_N0_M01; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1< + BlockSize, + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, // TODO: Add ShuffleType for DeviceConv2d + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock * K1, + K1, // AK1 + K1, // BK1 + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, // ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder, + Sequence<1, 
0, 2>, // BBlockTransferSrcAccessOrder, + 2, // BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + CBlockTransferScalarPerVector_NWaveNPerXdl>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_in_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_out_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_{}, + block_2_ctile_map_{}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + a_grid_desc_k0_m_k1_ = descs[I0]; + b_grid_desc_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + + block_2_ctile_map_ = Block2CTileMap{c_grid_desc_m_n_}; + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ = + GridwiseGemm:: + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + c_grid_desc_m_n_); + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_; + Block2CTileMap block_2_ctile_map_; + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + std::vector input_spatial_lengths_; + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, 
const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << DeviceOp{}.GetTypeString() << std::endl; + std::cout << "N " << arg.Conv_N_ << ", " + << "K " << arg.Conv_K_ << ", " + << "C " << arg.Conv_C_ << ", " << std::endl; + std::cout << "Y X " << arg.filter_spatial_lengths_[0] << ", " + << arg.filter_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Hi Wi " << arg.input_spatial_lengths_[0] << ", " + << arg.input_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Ho Wo " << arg.output_spatial_lengths_[0] << ", " + << arg.output_spatial_lengths_[1] << ", " << std::endl; + std::cout << "Strides " << arg.conv_filter_strides_[0] << ", " + << arg.conv_filter_strides_[1] << ", " << std::endl; + std::cout << "Dilations " << arg.conv_filter_dilations_[0] << ", " + << arg.conv_filter_dilations_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_left_pads_[0] << ", " + << arg.input_left_pads_[1] << ", " << std::endl; + std::cout << "InLeftPads " << arg.input_right_pads_[0] << ", " + << arg.input_right_pads_[1] << ", " << std::endl; + } + + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + std::cout + << "arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_" + "nwavenperxdl_{ " + << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ + .GetLength(I0) + << ", " + << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ + .GetLength(I1) + << ", " + << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ + .GetLength(I2) + << ", " + << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ + .GetLength(I3) + << ", " + << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ + .GetLength(I4) + << ", " + << arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_ + .GetLength(I5) + << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v3r1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + Block2CTileMap, + true>; + + ave_time = launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_v3r1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm:: + CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl>, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + Block2CTileMap, + false>; + + ave_time = launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 && + arg.Conv_C_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_K_ % 
CBlockTransferScalarPerVector_NWaveNPerXdl == 0)) + { + return false; + } + + // Gridwise GEMM size + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_grid, + const void* p_wei_grid, + void* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp new file mode 100644 index 00000000..5821e06b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp @@ -0,0 +1,733 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
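+// Implicit-GEMM mapping used by this device op (summary of the descriptor
+// construction below; the sizes in the example are chosen only for
+// illustration):
+//
+//   GemmM = N * Ho * Wo   rows of C (padded up to a multiple of MPerBlock)
+//   GemmN = K             columns of C
+//   GemmK = Y * X * C     reduction dim, split as GemmK0 * K1 for xdlops
+//
+// For example, N = 2, Ho = Wo = 28, K = 256, C = 128, Y = X = 3, K1 = 8 gives
+//   GemmM = 2 * 28 * 28 = 1568, GemmN = 256,
+//   GemmK = 3 * 3 * 128 = 1152, GemmK0 = 1152 / 8 = 144.
+//
+// The Filter1x1Pad0 specialization drops the padding transforms, and
+// Filter1x1Stride1Pad0 additionally reads the input as a plain
+// (N * Ho * Wo) x C matrix.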
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K + : public DeviceConvFwd<2, + ck::tensor_layout::convolution::NHWC, + ck::tensor_layout::convolution::KYXC, + ck::tensor_layout::convolution::NHWK, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + using DeviceOp = DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K; + + using ADataType = InDataType; + using BDataType = WeiDataType; + using CDataType = OutDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr index_t NDimSpatial = 2; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t GemmMRaw = N * Ho * Wo; + const index_t GemmN = K; + const index_t GemmK = Y * X * C; + + const auto GemmMPad = math::integer_least_multiple(GemmMRaw, MPerBlock) - GemmMRaw; + + assert(GemmK % GemmK1Number == 0); + + const index_t GemmK0 = GemmK / GemmK1Number; + + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // A: input tensor + const auto in_gemmmraw_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, C)); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmmraw_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_right_pad_transform(GemmMRaw, GemmMPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + 
make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_ho_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_n_ho_wo_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<3>{}, Sequence<0, 1, 2>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_gemmn_gemmk_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, C)); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmn_gemmk_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_gemmmraw_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + else + { + // A: input tensor + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto 
in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmk_gemmmraw_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmmraw_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk_gemmmraw_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmMRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + const auto in_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmk0_gemmmraw_gemmk1_grid_desc, + make_tuple(make_pass_through_transform(GemmK0), + make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmK1Number)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B: weight tensor + const auto wei_k_yxc_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + const auto wei_gemmk_gemmn_grid_desc = transform_tensor_descriptor( + wei_k_yxc_grid_desc, + make_tuple(make_pass_through_transform(K), make_pass_through_transform(Y * X * C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_gemmk_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: output tensor + const auto out_nhowo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmmraw_gemmn_grid_desc = + transform_tensor_descriptor(out_nhowo_k_grid_desc, + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmm_gemmn_grid_desc = + transform_tensor_descriptor(out_gemmmraw_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmMRaw, GemmMPad), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(in_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + out_gemmm_gemmn_grid_desc); + } + } + + using ABCGridDescs = decltype(MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + 1, 1, 1, {1, 
1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1})); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3< + BlockSize, + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, // ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // BBlockTransferSrcAccessOrder, + 2, // BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder, + 7, // CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + ck::index_t M01, + ck::index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_in_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_out_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + filter_spatial_lengths_{filter_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + a_grid_desc_k0_m_k1_ = descs[I0]; + b_grid_desc_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01); + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ = + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 
b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + std::vector filter_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + remove_reference_t, + true>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return 
true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.conv_filter_strides_[0] == 1 && arg.conv_filter_strides_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + if(!(arg.filter_spatial_lengths_[0] == 1 && arg.filter_spatial_lengths_[1] == 1 && + arg.input_left_pads_[0] == 0 && arg.input_left_pads_[1] == 0 && + arg.input_right_pads_[0] == 0 && arg.input_right_pads_[1] == 0)) + { + return false; + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 && + arg.Conv_C_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_K_ % CThreadTransferDstScalarPerVector == 0)) + { + return false; + } + + // Gridwise GEMM size + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_grid, + const void* p_wei_grid, + void* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + 
auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp new file mode 100644 index 00000000..f69d8f18 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_naive_ndhwc_kzyxc_ndhwk.hpp @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef DEVICE_CONV3D_FWD_NAIVE_HPP +#define DEVICE_CONV3D_FWD_NAIVE_HPP + +#include +#include +#include +#include "conv_util.hpp" +#include "device.hpp" +#include "device_conv_fwd.hpp" +#include "common_header.hpp" +#include "naive_conv_fwd.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// specialization for #D conv: in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +template +struct DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K + : public DeviceConvFwd + +{ + using DeviceOp = DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K; + + using ADataType = InDataType; + using BDataType = WeiDataType; + using CDataType = OutDataType; + // TODO make A/B datatype different + using ABDataType = InDataType; + + // Argument + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in, + const WeiDataType* p_wei, + OutDataType* p_out, + const index_t N, + const index_t K, + const index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : params_{3, + N, + K, + C, + filter_spatial_lengths, + input_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}, + out_spatial_lengths_{output_spatial_lengths}, + p_in_{p_in}, + p_wei_{p_wei}, + p_out_{p_out}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + + { + } + + // private: + utils::conv::ConvParams params_; + std::vector out_spatial_lengths_; + + const InDataType* p_in_; + const WeiDataType* p_wei_; + OutDataType* p_out_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto naive_conv3d_fwd = + ref::naive_conv_fwd_ndhwc_kzyxc_ndhwk; + + float ave_time = launch_and_time_kernel(stream_config, + naive_conv3d_fwd, + dim3(256), + dim3(256), + 0, + arg.p_in_, + arg.p_wei_, + arg.p_out_, + arg.N_, + arg.K_, + arg.C_, + arg.in_spatial_lengths_[0], + arg.in_spatial_lengths_[1], + arg.in_spatial_lengths_[2], + arg.filter_spatial_lengths_[0], + 
arg.filter_spatial_lengths_[1], + arg.filter_spatial_lengths_[2], + arg.out_spatial_lengths_[0], + arg.out_spatial_lengths_[1], + arg.out_spatial_lengths_[2], + arg.conv_filter_strides_[0], + arg.conv_filter_strides_[1], + arg.conv_filter_strides_[2], + arg.conv_filter_dilations_[0], + arg.conv_filter_dilations_[1], + arg.conv_filter_dilations_[2], + arg.in_left_pads_[0], + arg.in_left_pads_[1], + arg.in_left_pads_[2]); + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + std::vector out_spatial_lengths = arg.params_.GetOutputSpatialLengths(); + + bool out_lengths_are_consistent = out_spatial_lengths[0] == arg.out_spatial_lengths_[0] && + out_spatial_lengths[1] == arg.out_spatial_lengths_[1] && + out_spatial_lengths[2] == arg.out_spatial_lengths_[2]; + return out_lengths_are_consistent; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in, + const WeiDataType* p_wei, + OutDataType* p_out, + const index_t N, + const index_t K, + const index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in, + p_wei, + p_out, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_in, + const void* p_wei, + void* p_out, + const index_t N, + const index_t K, + const index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + + { + return std::make_unique(static_cast(p_in), + static_cast(p_wei), + static_cast(p_out), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv3dFwdNaive_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<>"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git 
a/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp new file mode 100644 index 00000000..f950538d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef DEVICE_CONV3D_FWD_XDL_HPP +#define DEVICE_CONV3D_FWD_XDL_HPP + +#include +#include +#include +#include "device.hpp" +#include "device_conv_fwd.hpp" +#include "common_header.hpp" +#include "tensor_layout.hpp" +#include "convolution_forward_specialization.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp" +#include "gridwise_gemm_xdlops_v2r3.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +/* + * \see \link impl/device_batched_gemm_xdl.hpp kernel_batched_gemm_xdlops_v2r3() \endlink. + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v2r3_for_conv3d( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const index_t num_batches, + const index_t a_batch_stride, + const index_t b_batch_stride, + const index_t c_batch_stride, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / num_batches); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = + __builtin_amdgcn_readfirstlane(static_cast(a_batch_stride) * g_idx); + const long_index_t b_batch_offset = + __builtin_amdgcn_readfirstlane(static_cast(b_batch_stride) * g_idx); + const long_index_t c_batch_offset = + __builtin_amdgcn_readfirstlane(static_cast(c_batch_stride) * g_idx); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); + +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = num_batches; + ignore = a_batch_stride; + ignore = b_batch_stride; + ignore = c_batch_stride; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// specialization for #D conv: in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +template +struct DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K + : public DeviceConvFwd + +{ + using DeviceOp = 
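+// [Illustrative note, not part of this header] kernel_gemm_xdlops_v2r3_for_conv3d below maps one
+// grid launch onto `num_batches` independent sub-batch GEMMs: the grid holds
+// num_batches * num_blocks_per_batch workgroups, and each workgroup derives its sub-batch index
+// from its block id and offsets the A/B/C base pointers by that sub-batch's stride. For example,
+// with num_batches = 4 and 60 blocks per sub-batch GEMM (hypothetical numbers):
+//
+//   grid_size      = 4 * 60 = 240;
+//   g_idx          = get_block_1d_id() / 60;                 // 0..3
+//   a_batch_offset = long_index_t(a_batch_stride) * g_idx;   // likewise for B and C
+//
+// __builtin_amdgcn_readfirstlane is used so the compiler can treat these values as
+// wavefront-uniform (held in scalar registers) instead of recomputing them per lane.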
DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K;
+
+    using ADataType = InDataType;
+    using BDataType = WeiDataType;
+    using CDataType = OutDataType;
+    // TODO make A/B datatype different
+    using ABDataType = InDataType;
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+    static constexpr auto I2 = Number<2>{};
+    static constexpr auto I3 = Number<3>{};
+
+    /*
+     * \brief Split the number of batches, \p N, into N = B * N1, such that the memory
+     * space of input and output tensors stays within the value range of index_t, and each
+     * subbatch can be handled by GridwiseGemm.
+     */
+    static index_t GetMaxAllowableSubBatchSize(const index_t N,
+                                               const index_t K,
+                                               const index_t C,
+                                               std::vector<ck::index_t> input_spatial_lengths,
+                                               std::vector<ck::index_t> output_spatial_lengths)
+    {
+        const index_t Di = input_spatial_lengths[0];
+        const index_t Hi = input_spatial_lengths[1];
+        const index_t Wi = input_spatial_lengths[2];
+
+        const index_t Do = output_spatial_lengths[0];
+        const index_t Ho = output_spatial_lengths[1];
+        const index_t Wo = output_spatial_lengths[2];
+
+        // N1 should satisfy that
+        // 1) N % N1 = 0;
+        // 2) N1 * (Do * Ho * Wo * K) < (2^31 - 1)
+        // 3) N1 * (Di * Hi * Wi * C) < (2^31 - 1)
+        //
+        // Do NOT confuse (B, N1) in this function with (B, N1) in gridwise GEMM.
+        auto N1 = N + 1;
+
+        const auto stride =
+            math::max(long_index_t(Do) * Ho * Wo * K, long_index_t(Di) * Hi * Wi * C);
+        const index_t max_stride = NumericLimits<index_t>::Max();
+
+        for(index_t n0 = 1; n0 <= N; ++n0)
+        {
+            index_t n1 = N / n0;
+            if(n0 * n1 == N && long_index_t(n1) * long_index_t(stride) < max_stride)
+            {
+                N1 = n1;
+                break;
+            }
+        }
+
+        const auto B = N / N1;
+        if(B * N1 != N)
+        {
+            throw std::runtime_error(__func__ +
+                                     std::string(": failed to find num_subbatches for conv3d.\n"));
+        }
+
+        return N1;
+    }
+
+    static auto
+    MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(const index_t N,
+                                                    const index_t K,
+                                                    const index_t C,
+                                                    std::vector<ck::index_t> input_spatial_lengths,
+                                                    std::vector<ck::index_t> filter_spatial_lengths,
+                                                    std::vector<ck::index_t> output_spatial_lengths,
+                                                    std::vector<ck::index_t> conv_filter_strides,
+                                                    std::vector<ck::index_t> conv_filter_dilations,
+                                                    std::vector<ck::index_t> input_left_pads,
+                                                    std::vector<ck::index_t> input_right_pads)
+    {
+        assert(input_spatial_lengths.size() > 2);
+        assert(filter_spatial_lengths.size() > 2);
+        assert(conv_filter_strides.size() > 2);
+        assert(conv_filter_dilations.size() > 2);
+        assert(input_left_pads.size() > 2);
+        assert(input_right_pads.size() > 2);
+
+        const index_t Di = input_spatial_lengths[0];
+        const index_t Hi = input_spatial_lengths[1];
+        const index_t Wi = input_spatial_lengths[2];
+        const index_t Z = filter_spatial_lengths[0];
+        const index_t Y = filter_spatial_lengths[1];
+        const index_t X = filter_spatial_lengths[2];
+
+        const index_t Do = output_spatial_lengths[0];
+        const index_t Ho = output_spatial_lengths[1];
+        const index_t Wo = output_spatial_lengths[2];
+
+        static_assert(ConvForwardSpecialization == ConvolutionForwardSpecialization::Default,
+                      "Wrong! 
This specialization not implemented!"); + + const auto in_desc_n_di_hi_wi_c = + make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); + const auto wei_desc_k_z_y_x_c = + make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C)); + const auto out_desc_n_do_ho_wo_k = + make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K)); + + const auto descs = transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad( + in_desc_n_di_hi_wi_c, + wei_desc_k_z_y_x_c, + out_desc_n_do_ho_wo_k, + make_tuple(conv_filter_strides[0], conv_filter_strides[1], conv_filter_strides[2]), + make_tuple( + conv_filter_dilations[0], conv_filter_dilations[1], conv_filter_dilations[2]), + make_tuple(input_left_pads[0], input_left_pads[1], input_left_pads[2]), + make_tuple(input_right_pads[0], input_right_pads[1], input_right_pads[2]), + Number{}); + + return descs; + } + + using ABCGridDescs = remove_cvref_t; + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3< + BlockSize, + InDataType, + AccDataType, + OutDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + Sequence<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder, + Sequence<1, 0, 2>, // ABlockTransferSrcAccessOrder, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + Sequence<2, 3, 0, 1, 7, 5, 4, 6>, + 7, + CThreadTransferDstScalarPerVector>; + + using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = + decltype(GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{})); + using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in, + const WeiDataType* p_wei, + OutDataType* p_out, + const index_t N, + const index_t K, + const index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + index_t M01, + index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_in}, + p_b_grid_{p_wei}, + p_c_grid_{p_out}, + M01_{M01}, + N01_{N01}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + const index_t subbatch_size = + GetMaxAllowableSubBatchSize(N, K, C, input_spatial_lengths, output_spatial_lengths); + num_subbatches_ = N / subbatch_size; + + const auto descs = + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(subbatch_size, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + 
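+            // [Illustrative worked example, not part of this header] GetMaxAllowableSubBatchSize()
+            // picks the largest divisor N1 of N whose per-sub-batch tensor footprint still fits in
+            // index_t. For instance, with N = 16, Di = Hi = Wi = Do = Ho = Wo = 128 and C = K = 64
+            // the per-sample footprint is 128^3 * 64 = 2^27 elements, so:
+            //
+            //   n0 = 1 -> n1 = 16 : 16 * 2^27 = 2^31 >= NumericLimits<index_t>::Max() -> rejected
+            //   n0 = 2 -> n1 = 8  :  8 * 2^27 = 2^30 <  NumericLimits<index_t>::Max() -> N1 = 8
+            //
+            // giving subbatch_size = 8 and num_subbatches_ = 16 / 8 = 2. Conceptually, each
+            // sub-batch is then one implicit GEMM with GemmM = N1 * Do * Ho * Wo, GemmN = K and
+            // GemmK = Z * Y * X * C (split as K0 * K1 by the descriptors created here).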
conv_filter_dilations, + input_left_pads, + input_right_pads); + + a_grid_desc_k0_m_k1_ = descs[I0]; + b_grid_desc_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01); + + a_batch_stride_ = a_grid_desc_k0_m_k1_.GetElementSpaceSize(); + b_batch_stride_ = 0; + c_batch_stride_ = c_grid_desc_m_n_.GetElementSpaceSize(); + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ = + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_); + } + } + + // private: + const InDataType* p_a_grid_; + const WeiDataType* p_b_grid_; + OutDataType* p_c_grid_; + index_t num_subbatches_; + index_t a_batch_stride_; + index_t b_batch_stride_; + index_t c_batch_stride_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; + Block2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + { + std::cout << "num_batches_of_GEMM = " << arg.num_subbatches_ << std::endl; + std::cout << "a_grid_desc_k0_m_k1{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "b_grid_desc_k0_n_k1{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "c_grid_desc_m_n{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * + arg.num_subbatches_; + + const auto K0 = arg.a_grid_desc_k0_m_k1_.GetLength(I0); + + const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0); + + float ave_time = 0; + if(has_main_k0_block_loop) + { + const auto kernel = kernel_gemm_xdlops_v2r3_for_conv3d< + GridwiseGemm, + InDataType, + OutDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + remove_reference_t, + true>; + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.num_subbatches_, + arg.a_batch_stride_, + arg.b_batch_stride_, + arg.c_batch_stride_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r3_for_conv3d< + GridwiseGemm, + InDataType, + OutDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.num_subbatches_, + arg.a_batch_stride_, + arg.b_batch_stride_, + arg.c_batch_stride_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.in_element_op_, + arg.wei_element_op_, + arg.out_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in, + const WeiDataType* p_wei, + OutDataType* p_out, + const index_t N, + const index_t K, + const index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in, + p_wei, + p_out, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_in, + const void* p_wei, + void* p_out, + const index_t N, + const 
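+            // [Illustrative sketch, not part of this header] The two branches above exist only to
+            // map a runtime flag onto a compile-time template parameter, so the K0 main-loop /
+            // tail handling inside the kernel is resolved at compile time. A minimal stand-alone
+            // version of the pattern (hypothetical names, not CK APIs):
+            //
+            //   #include <cstdio>
+            //
+            //   template <bool HasMainK0BlockLoop>
+            //   void run_gemm_body() // imagine two separately compiled kernel variants
+            //   {
+            //       std::printf(HasMainK0BlockLoop ? "main-loop variant\n" : "tail-only variant\n");
+            //   }
+            //
+            //   inline void dispatch(bool has_main_k0_block_loop)
+            //   {
+            //       has_main_k0_block_loop ? run_gemm_body<true>() : run_gemm_body<false>();
+            //   }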
index_t K, + const index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + + { + return std::make_unique(static_cast(p_in), + static_cast(p_wei), + static_cast(p_out), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp new file mode 100644 index 00000000..4cb111c8 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp @@ -0,0 +1,1583 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct DeviceConvNdBwdDataNwcKxcNwk_Dl + : public DeviceConvBwdData< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + using DeviceOp = DeviceConvNdBwdDataNwcKxcNwk_Dl; + + using ADataType = OutDataType; + using BDataType = WeiDataType; + using CDataType = InDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + template ::type = false> + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + 
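+    // [Illustrative note, not part of this header] For backward data the GEMM roles are swapped
+    // relative to the forward kernels above: A is the output gradient, B is the weight, C is the
+    // input gradient (hence ADataType = OutDataType, BDataType = WeiDataType,
+    // CDataType = InDataType). For the 1-D overload starting here, each (i_xtilde) sub-GEMM is,
+    // roughly,
+    //
+    //   GemmM = N * WTildeSlice,   GemmN = C,   GemmK = XDotSlice * K   (K split as K0 * K1)
+    //
+    //   in_grad[m, n] = sum_k out_grad[k, m] * wei[k, n]
+    //
+    // and the 2-D / 3-D overloads extend the same mapping with the H (and D) dimensions.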
std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + std::vector tildes) + { + using namespace ck; + + index_t i_xtilde = tildes[0]; + + const index_t Wi = input_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[0]; + const index_t InLeftPadW = input_left_pads[0]; + const index_t InRightPadW = input_right_pads[0]; + const index_t ConvStrideW = conv_filter_strides[0]; + const index_t ConvDilationW = conv_filter_dilations[0]; + + const auto K0 = K / K1; + + const auto in_n_wi_c_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Wo, K)), + make_tuple(make_pass_through_transform(N * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto out_n_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Wo, K)); + const auto wei_k_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, X, C)); + + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_wop_k_grid_desc = transform_tensor_descriptor( + out_n_wo_k_grid_desc, + 
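+            // [Illustrative worked example, not part of this header] The XTilde / XDot
+            // decomposition splits backward data into XTilde independent GEMMs, one per residue
+            // class of filter taps. For ConvStrideW = 2, ConvDilationW = 1, X = 3:
+            //
+            //   GcdStrideDilationW = gcd(2, 1) = 1
+            //   XTilde             = 2 / 1     = 2     // number of sub-GEMMs along W
+            //   XDot               = ceil(3/2) = 2     // taps per residue class (upper bound)
+            //   WTilde             = Wo + ceil(1 * (3 - 1) / 2) = Wo + 1
+            //   XDotSlice(i_xtilde = 0) = ceil((3 - 0) / 2) = 2
+            //   XDotSlice(i_xtilde = 1) = ceil((3 - 1) / 2) = 1
+            //
+            // i.e. the first sub-GEMM accumulates two taps' worth of GemmK, the second only one.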
make_tuple(make_pass_through_transform(N), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto out_n_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto out_n_xdotslice_wtildeslice_k0_k1_grid_desc = transform_tensor_descriptor( + out_n_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(XDotSlice, K0)), + make_merge_transform(make_tuple(N, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3>{}, Sequence<0, 2>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto wei_k0_k1_xdotslice_c_grid_desc = transform_tensor_descriptor( + wei_k_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<>{}, Sequence<3>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 0>{}, Sequence<3>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_wip_c_grid_desc = transform_tensor_descriptor( + in_n_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_n_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_n_wtildeslice_c_grid_desc = 
transform_tensor_descriptor( + in_n_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + template ::type = false> + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + std::vector tildes) + { + using namespace ck; + + index_t i_ytilde = tildes[0]; + index_t i_xtilde = tildes[1]; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const auto K0 = K / K1; + + const auto out_n_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K)); + const auto wei_k_y_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C)); + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + 
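+            // [Illustrative note, not part of this header] In the Filter1x1Stride1Pad0 branch the
+            // tilde decomposition degenerates: with a 1x1 filter, unit stride and zero padding,
+            // Ho = Hi and Wo = Wi, and backward data reduces to a single plain GEMM
+            //
+            //   in_grad[N * Ho * Wo, C] = out_grad[N * Ho * Wo, K] * wei[K, C]
+            //
+            // which is exactly what the three descriptors built in this branch express
+            // (with K split into K0 * K1).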
make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, 
XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, + 
make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_htildeslice_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + + template ::type = false> + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + std::vector tildes) + { + using namespace ck; + + const index_t i_ztilde = tildes[0]; + const index_t i_ytilde = tildes[1]; + const index_t i_xtilde = tildes[2]; + + const index_t Di = input_spatial_lengths[0]; + const index_t Hi = input_spatial_lengths[1]; + const index_t Wi = input_spatial_lengths[2]; + + const index_t Do = output_spatial_lengths[0]; + const index_t Ho = output_spatial_lengths[1]; + const index_t Wo = output_spatial_lengths[2]; + + const index_t Z = filter_spatial_lengths[0]; + const index_t Y = filter_spatial_lengths[1]; + const index_t X = filter_spatial_lengths[2]; + + const index_t InLeftPadD = input_left_pads[0]; + const index_t InLeftPadH = input_left_pads[1]; + const index_t InLeftPadW = input_left_pads[2]; + + const index_t InRightPadD = input_right_pads[0]; + const index_t InRightPadH = input_right_pads[1]; + const index_t InRightPadW = input_right_pads[2]; + + const index_t ConvStrideD = conv_filter_strides[0]; + const index_t ConvStrideH = conv_filter_strides[1]; + const index_t ConvStrideW = conv_filter_strides[2]; + + const index_t ConvDilationD = conv_filter_dilations[0]; + const index_t ConvDilationH = conv_filter_dilations[1]; + const index_t ConvDilationW = conv_filter_dilations[2]; + + const auto K0 = K / K1; + + const auto out_n_do_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K)); + const auto wei_k_z_y_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C)); + const auto in_n_di_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Do * Ho * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + 
transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Do), make_tuple(I1, ConvStrideD)), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_z_do_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Do, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, + Sequence<3>{}, + Sequence<5>{}, + Sequence<0, 2, 4, 6>{}, + Sequence<7>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto ZTilde = ConvStrideD / GcdStrideDilationD; + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto ZDot = math::integer_divide_ceil(Z, ZTilde); + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto DTilde = + Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD); + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IDTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD); + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IDTildeSliceEnd = math::min( + DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1); + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin; + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); + const auto 
YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_dop_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_do_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Do, I0, I0), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc = + transform_tensor_descriptor( + out_n_dop_hop_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(ZDot, DTilde), + make_tuple(-ConvDilationD / GcdStrideDilationD, I1)), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto + out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(ZDot, I0, ZDotSlice), + make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}, + Sequence<7>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}, + Sequence<7, 8>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple( + make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}, Sequence<8>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc = + transform_tensor_descriptor( + wei_k_z_y_x_c_grid_desc, + make_tuple( + make_pass_through_transform(K), + make_embed_transform(make_tuple(ZDot, ZTilde), + make_tuple(ConvStrideD / GcdStrideDilationD, I1)), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto 
wei_k0_k1_zdotslice_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(ZDot, I0, ZDotSlice), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ztilde), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<5>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<6>{}, + Sequence<7>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<5>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_zdotslice_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 4, 0>{}, Sequence<5>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Di, InLeftPadD, InRightPadD), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc = + transform_tensor_descriptor( + in_n_dip_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(ZTilde, DTilde), + make_tuple(ConvDilationD, ConvStrideD)), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc = + transform_tensor_descriptor( + in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ztilde), + make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}, + Sequence<7>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<>{}, + Sequence<3>{}, + Sequence<4>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc, + make_tuple( + make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2, 3>{}, 
Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<1>( + 1, 1, 1, {1}, {1}, {1}, {1}, {1}, {1}, {1}, {0}); + } + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<2>( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 0}); + } + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<3>(1, + 1, + 1, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {0, 0, 0}); + } + + using ABCGridDescs = decltype(GetABCGridDesc()); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + // GridwiseGemm + using GridwiseGemm = + GridwiseGemmDl_km_kn_mn_v1r3; + + using AGridDesc_K0_M0_M1_K1 = + decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{})); + using BGridDesc_K0_N0_N1_K1 = + decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{})); + using DefaultBlock2CTileMap = + decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{})); + // Argument + struct Argument : public BaseArgument + { + Argument(InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_out_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_in_grid}, + a_element_op_{out_element_op}, + b_element_op_{wei_element_op}, + c_element_op_{in_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + CreateABCDesc(); + } + + template ::type = false> + void CreateABCDesc() + { + const index_t ConvStrideW = conv_filter_strides_[0]; + const index_t ConvDilationW = conv_filter_dilations_[0]; + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const index_t X = filter_spatial_lengths_[0]; + + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(XDotSlice <= 0) + { + continue; + } + + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + Conv_N_, + Conv_K_, + Conv_C_, + input_spatial_lengths_, + filter_spatial_lengths_, + output_spatial_lengths_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + 
{i_xtilde}); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + if(GridwiseGemm::CheckValidity(descs[I0], descs[I1], descs[I2])) + { + a_grid_desc_k0_m0_m1_k1_container_.push_back( + GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(descs[I0])); + b_grid_desc_k0_n0_n1_k1_container_.push_back( + GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(descs[I1])); + c_grid_desc_m0_m10_m11_n0_n10_n11_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(descs[I2])); + + block_2_ctile_map_container_.push_back( + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2])); + } + } + } + template ::type = false> + void CreateABCDesc() + { + const index_t ConvStrideH = conv_filter_strides_[0]; + const index_t ConvStrideW = conv_filter_strides_[1]; + + const index_t ConvDilationH = conv_filter_dilations_[0]; + const index_t ConvDilationW = conv_filter_dilations_[1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const index_t Y = filter_spatial_lengths_[0]; + const index_t X = filter_spatial_lengths_[1]; + for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde) + { + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(YDotSlice * XDotSlice <= 0) + { + continue; + } + + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + Conv_N_, + Conv_K_, + Conv_C_, + input_spatial_lengths_, + filter_spatial_lengths_, + output_spatial_lengths_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + {i_ytilde, i_xtilde}); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + if(GridwiseGemm::CheckValidity(descs[I0], descs[I1], descs[I2])) + { + a_grid_desc_k0_m0_m1_k1_container_.push_back( + GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(descs[I0])); + b_grid_desc_k0_n0_n1_k1_container_.push_back( + GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(descs[I1])); + c_grid_desc_m0_m10_m11_n0_n10_n11_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(descs[I2])); + + block_2_ctile_map_container_.push_back( + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2])); + } + } + } + } + template ::type = false> + void CreateABCDesc() + { + const index_t ConvStrideD = conv_filter_strides_[0]; + const index_t ConvStrideH = conv_filter_strides_[1]; + const index_t ConvStrideW = conv_filter_strides_[2]; + + const index_t ConvDilationD = conv_filter_dilations_[0]; + const index_t ConvDilationH = conv_filter_dilations_[1]; + const index_t ConvDilationW = conv_filter_dilations_[2]; + + const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto ZTilde = ConvStrideD / GcdStrideDilationD; + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const index_t Z = filter_spatial_lengths_[0]; + 
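+        // [Illustrative note, not part of this header] CreateABCDesc() enumerates one GEMM per
+        // (i_ytilde, i_xtilde) pair and skips pairs whose filter-tap slice is empty. For a 3x3
+        // filter with stride 2x2 and dilation 1x1, YTilde = XTilde = 2, so four GEMMs are created
+        // (YDotSlice and XDotSlice are each 2 or 1). For a 1x1 filter with stride 2x2 only (0, 0)
+        // survives: e.g. i_ytilde = 1 gives YDotSlice = ceil((1 - 1) / 2) = 0, so that
+        // combination is skipped.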
const index_t Y = filter_spatial_lengths_[1]; + const index_t X = filter_spatial_lengths_[2]; + for(index_t i_ztilde = 0; i_ztilde < ZTilde; ++i_ztilde) + { + for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde) + { + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(ZDotSlice * YDotSlice * XDotSlice <= 0) + { + continue; + } + + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + Conv_N_, + Conv_K_, + Conv_C_, + input_spatial_lengths_, + filter_spatial_lengths_, + output_spatial_lengths_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + {i_ztilde, i_ytilde, i_xtilde}); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + if(GridwiseGemm::CheckValidity(descs[I0], descs[I1], descs[I2])) + { + a_grid_desc_k0_m0_m1_k1_container_.push_back( + GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(descs[I0])); + b_grid_desc_k0_n0_n1_k1_container_.push_back( + GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(descs[I1])); + c_grid_desc_m0_m10_m11_n0_n10_n11_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(descs[I2])); + + block_2_ctile_map_container_.push_back( + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2])); + } + } + } + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + std::vector a_grid_desc_k0_m_k1_container_; + std::vector b_grid_desc_k0_n_k1_container_; + std::vector c_grid_desc_m_n_container_; + + std::vector a_grid_desc_k0_m0_m1_k1_container_; + std::vector b_grid_desc_k0_n0_n1_k1_container_; + std::vector c_grid_desc_m0_m10_m11_n0_n10_n11_container_; + + std::vector block_2_ctile_map_container_; + + // element-wise op + OutElementwiseOperation a_element_op_; + WeiElementwiseOperation b_element_op_; + InElementwiseOperation c_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + + std::vector input_spatial_lengths_; + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float ave_time = 0; + for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + { + { + std::cout << "arg.a_grid_desc_k0_m_k1_container_{" + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", " + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2) << "}" + << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_container_{" + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I0) << ", " + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I2) << "}" + << std::endl; + + std::cout << "arg.c_grid_desc_m_n_container_{ " + << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", " + << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}" + << 
std::endl; + + std::cout << "arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_( " + << arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i].GetLength(I0) + << ", " + << arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i].GetLength(I1) + << ", " + << arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i].GetLength(I2) + << ", " + << arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i].GetLength(I3) + << ", " + << arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i].GetLength(I4) + << ", " + << arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i].GetLength(I5) + << " ) " << std::endl; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i])) + { + throw std::runtime_error( + "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting"); + } + + const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize( + arg.c_grid_desc_m_n_container_[i]); + + auto launch_kernel = [&](auto has_main_k_block_loop, + auto has_double_tail_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + constexpr bool has_double_loop = has_double_tail_k_block_loop; + + const auto kernel = kernel_gemm_dl_v1r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + has_main_loop, + has_double_loop>; + + ave_time += + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m0_m1_k1_container_[i], + arg.b_grid_desc_k0_n0_n1_k1_container_[i], + arg.c_grid_desc_m0_m10_m11_n0_n10_n11_container_[i], + arg.block_2_ctile_map_container_[i]); + }; + + const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_container_[i].GetLength(I0); + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0); + const bool has_double_tail_k_block_loop = + GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0); + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + launch_kernel(integral_constant{}, integral_constant{}); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + launch_kernel(integral_constant{}, + integral_constant{}); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + launch_kernel(integral_constant{}, + integral_constant{}); + } + else + { + launch_kernel(integral_constant{}, + integral_constant{}); + } + } + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + // check device + if(!(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030")) + { + return false; + } + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 pad = 0 conv + for(int i = 0; i < NDimSpatial; i++) + { + if(!(arg.filter_spatial_lengths_[i] == 1 && arg.conv_filter_strides_[i] == 1 && + arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0)) + { + return false; + } + } + } + + // matrix A + { + auto srcVectorLengths = ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1{}; + if(srcVectorLengths[I1] != 1 || 
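+               // (sketch of the intent: A is the conv output in N[D/H/W]K layout, so it
+               //  is only contiguous along K; vectorized loads may therefore span only
+               //  the K0/K1 components, and the M0/M1 components must stay 1)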
srcVectorLengths[I2] != 1) + { + return false; + } + if(K1 % srcVectorLengths[I3] != 0 || K0PerBlock % srcVectorLengths[I0] != 0) + { + return false; + } + + const index_t K = arg.Conv_K_; + + if(K % (srcVectorLengths[I0] * srcVectorLengths[I3]) != 0) + { + return false; + } + } + + // matrix B + { + auto srcLoadLenghts = BBlockTransferThreadSliceLengths_K0_N0_N1_K1{}; + auto srcVectorLengths = BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1{}; + if(srcVectorLengths[I0] != 1 || srcVectorLengths[I3] != 1) + { + return false; + } + if(srcLoadLenghts[I1] % srcVectorLengths[I1] != 0 || + srcLoadLenghts[I2] % srcVectorLengths[I2] != 0) + { + return false; + } + + const index_t C = arg.Conv_K_; + + if(C % (srcVectorLengths[I1] * srcVectorLengths[I2]) != 0) + { + return false; + } + } + // vector store C matrix into global memory + if(!(arg.Conv_C_ % CThreadTransferDstScalarPerVector == 0)) + { + std::cout << "Not surpport,because: arg.Conv_C_ % CThreadTransferDstScalarPerVector = " + << arg.Conv_C_ % CThreadTransferDstScalarPerVector << std::endl; + return false; + } + + // Gridwise GEMM size + for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i])) + { + return false; + } + } + return true; + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(void* p_in_grid, + const void* p_wei_grid, + const void* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConvNdBwdDataNwcKxcNwk_Dl" + << "<" + << 
BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0){ + + str<< " Filter1x1Stride1Pad0"; + } + + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp new file mode 100644 index 00000000..e10e374b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp @@ -0,0 +1,1568 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct DeviceConvNdBwdDataNwcKxcNwk_Xdl + : public DeviceConvBwdData< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + using DeviceOp = DeviceConvNdBwdDataNwcKxcNwk_Xdl; + + using ADataType = OutDataType; + using BDataType = WeiDataType; + using CDataType = InDataType; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + static_assert((K1 % ABlockTransferThreadClusterLengths_K0_M_K1{}[I2]) % + ABlockTransferSrcScalarPerVector == + 0); + static_assert((NPerBlock / BBlockTransferThreadClusterLengths_K0_N_K1{}[I1]) % + BBlockTransferSrcScalarPerVector == + 0); + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + template ::type = false> + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + std::vector tildes) + { + using namespace ck; + + index_t i_xtilde = tildes[0]; + + const index_t Wi = input_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[0]; + const index_t InLeftPadW = input_left_pads[0]; + const index_t InRightPadW = input_right_pads[0]; + const index_t ConvStrideW = conv_filter_strides[0]; + const index_t ConvDilationW = 
conv_filter_dilations[0]; + + const auto K0 = K / K1; + + const auto in_n_wi_c_grid_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Wo, K)), + make_tuple(make_pass_through_transform(N * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto out_n_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Wo, K)); + const auto wei_k_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, X, C)); + + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_wop_k_grid_desc = transform_tensor_descriptor( + out_n_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto out_n_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, 
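+                // (index-math sketch, not original commentary: writing
+                //   x  = i_xtilde + xdot * XTilde                                  and
+                //   wo = wtilde - xdot * ConvDilationW / GcdStrideDilationW,
+                //  the forward relation  wi + pad = wo * ConvStrideW + x * ConvDilationW
+                //  collapses to  wi + pad = wtilde * ConvStrideW + i_xtilde * ConvDilationW,
+                //  which is why the embed above carries the negative coefficient)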
Sequence<1, 2>{}, Sequence<3>{})); + + const auto out_n_xdotslice_wtildeslice_k0_k1_grid_desc = transform_tensor_descriptor( + out_n_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(XDotSlice, K0)), + make_merge_transform(make_tuple(N, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3>{}, Sequence<0, 2>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto wei_k0_k1_xdotslice_c_grid_desc = transform_tensor_descriptor( + wei_k_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<>{}, Sequence<3>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 0>{}, Sequence<3>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_wip_c_grid_desc = transform_tensor_descriptor( + in_n_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_n_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_n_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, WTildeSlice)), + make_pass_through_transform(C)), + 
make_tuple(Sequence<0, 1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + template ::type = false> + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + std::vector tildes) + { + using namespace ck; + + index_t i_ytilde = tildes[0]; + index_t i_xtilde = tildes[1]; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const auto K0 = K / K1; + + const auto out_n_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K)); + const auto wei_k_y_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C)); + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<3>{}, Sequence<0, 2, 4>{}, Sequence<5>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return 
make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + 
make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_k0_k1_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + 
in_n_htildeslice_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + + template ::type = false> + static auto + MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + std::vector tildes) + { + using namespace ck; + + const index_t i_ztilde = tildes[0]; + const index_t i_ytilde = tildes[1]; + const index_t i_xtilde = tildes[2]; + + const index_t Di = input_spatial_lengths[0]; + const index_t Hi = input_spatial_lengths[1]; + const index_t Wi = input_spatial_lengths[2]; + + const index_t Do = output_spatial_lengths[0]; + const index_t Ho = output_spatial_lengths[1]; + const index_t Wo = output_spatial_lengths[2]; + + const index_t Z = filter_spatial_lengths[0]; + const index_t Y = filter_spatial_lengths[1]; + const index_t X = filter_spatial_lengths[2]; + + const index_t InLeftPadD = input_left_pads[0]; + const index_t InLeftPadH = input_left_pads[1]; + const index_t InLeftPadW = input_left_pads[2]; + + const index_t InRightPadD = input_right_pads[0]; + const index_t InRightPadH = input_right_pads[1]; + const index_t InRightPadW = input_right_pads[2]; + + const index_t ConvStrideD = conv_filter_strides[0]; + const index_t ConvStrideH = conv_filter_strides[1]; + const index_t ConvStrideW = conv_filter_strides[2]; + + const index_t ConvDilationD = conv_filter_dilations[0]; + const index_t ConvDilationH = conv_filter_dilations[1]; + const index_t ConvDilationW = conv_filter_dilations[2]; + + const auto K0 = K / K1; + + const auto out_n_do_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Do, Ho, Wo, K)); + const auto wei_k_z_y_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Z, Y, X, C)); + const auto in_n_di_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); + + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Do * Ho * Wo), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + // B: weight tensor + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + // C: input tensor + const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Do), make_tuple(I1, ConvStrideD)), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + 
make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_z_do_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Do, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, + Sequence<3>{}, + Sequence<5>{}, + Sequence<0, 2, 4, 6>{}, + Sequence<7>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + else + { + const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto ZTilde = ConvStrideD / GcdStrideDilationD; + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto ZDot = math::integer_divide_ceil(Z, ZTilde); + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto DTilde = + Do + math::integer_divide_ceil(ConvDilationD * (Z - I1), ConvStrideD); + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IDTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadD - ConvDilationD * (ZTilde - I1)), ConvStrideD); + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IDTildeSliceEnd = math::min( + DTilde, math::integer_divide_ceil(InLeftPadD + Di - I1, ConvStrideD) + I1); + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto DTildeSlice = IDTildeSliceEnd - IDTildeSliceBegin; + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_dop_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_do_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Do, I0, I0), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, 
Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc = + transform_tensor_descriptor( + out_n_dop_hop_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(ZDot, DTilde), + make_tuple(-ConvDilationD / GcdStrideDilationD, I1)), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto + out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc = + transform_tensor_descriptor( + out_n_zdot_dtilde_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(ZDot, I0, ZDotSlice), + make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(K0, K1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}, + Sequence<7>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}, + Sequence<7, 8>{})); + + const auto out_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_n_zdotslice_dtildeslice_ydotslice_htildeslice_xdotslice_wtildeslice_k0_k1_grid_desc, + make_tuple( + make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K0)), + make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)), + make_pass_through_transform(K1)), + make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}, Sequence<8>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // B weight tensor + const auto wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc = + transform_tensor_descriptor( + wei_k_z_y_x_c_grid_desc, + make_tuple( + make_pass_through_transform(K), + make_embed_transform(make_tuple(ZDot, ZTilde), + make_tuple(ConvStrideD / GcdStrideDilationD, I1)), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto wei_k0_k1_zdotslice_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_zdot_ztilde_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_slice_transform(ZDot, I0, ZDotSlice), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ztilde), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<5>{}, + Sequence<2>{}, + Sequence<4>{}, + 
Sequence<6>{}, + Sequence<7>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<5>{})); + + const auto wei_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + wei_k0_k1_zdotslice_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(ZDotSlice, YDotSlice, XDotSlice, K0)), + make_pass_through_transform(C), + make_pass_through_transform(K1)), + make_tuple(Sequence<2, 3, 4, 0>{}, Sequence<5>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // C: input tensor + const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Di, InLeftPadD, InRightPadD), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc = + transform_tensor_descriptor( + in_n_dip_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(ZTilde, DTilde), + make_tuple(ConvDilationD, ConvStrideD)), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc = + transform_tensor_descriptor( + in_n_ztilde_dtilde_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ztilde), + make_slice_transform(DTilde, IDTildeSliceBegin, DTildeSlice), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}, + Sequence<7>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<>{}, + Sequence<3>{}, + Sequence<4>{})); + + const auto in_gemmm_gemmn_grid_desc = transform_tensor_descriptor( + in_n_dtildeslice_htildeslice_wtildeslice_c_grid_desc, + make_tuple( + make_merge_transform(make_tuple(N, DTildeSlice, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return make_tuple(out_gemmk0_gemmm_gemmk1_grid_desc, + wei_gemmk0_gemmn_gemmk1_grid_desc, + in_gemmm_gemmn_grid_desc); + } + + } // function end + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<1>( + 1, 1, 1, {1}, {1}, {1}, {1}, {1}, {1}, {1}, {0}); + } + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<2>( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {0, 
0}); + } + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<3>(1, + 1, + 1, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {0, 0, 0}); + } + + using ABCGridDescs = decltype(GetABCGridDesc()); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3< + BlockSize, + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXdl, + NPerXdl, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder, + 7, // CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector>; + + // Argument + struct Argument : public BaseArgument + { + Argument(InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + ck::index_t M01, + ck::index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : p_a_grid_{p_out_grid}, + p_b_grid_{p_wei_grid}, + p_c_grid_{p_in_grid}, + M01_{M01}, + N01_{N01}, + a_element_op_{out_element_op}, + b_element_op_{wei_element_op}, + c_element_op_{in_element_op}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + input_spatial_lengths_{input_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + output_spatial_lengths_{output_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + CreateABCDesc(); + } + + template ::type = false> + void CreateABCDesc() + { + const index_t ConvStrideW = conv_filter_strides_[0]; + const index_t ConvDilationW = conv_filter_dilations_[0]; + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const index_t X = filter_spatial_lengths_[0]; + + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(XDotSlice <= 0) + { + continue; + } + + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + Conv_N_, + Conv_K_, + Conv_C_, + input_spatial_lengths_, + 
filter_spatial_lengths_, + output_spatial_lengths_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + {i_xtilde}); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + auto block_2_ctile_map = + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_); + + if(GridwiseGemm::CheckValidity(descs[I0], descs[I1], descs[I2], block_2_ctile_map)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2])); + + block_2_ctile_map_container_.push_back(block_2_ctile_map); + } + } + } + template ::type = false> + void CreateABCDesc() + { + const index_t ConvStrideH = conv_filter_strides_[0]; + const index_t ConvStrideW = conv_filter_strides_[1]; + + const index_t ConvDilationH = conv_filter_dilations_[0]; + const index_t ConvDilationW = conv_filter_dilations_[1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const index_t Y = filter_spatial_lengths_[0]; + const index_t X = filter_spatial_lengths_[1]; + for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde) + { + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(YDotSlice * XDotSlice <= 0) + { + continue; + } + + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + Conv_N_, + Conv_K_, + Conv_C_, + input_spatial_lengths_, + filter_spatial_lengths_, + output_spatial_lengths_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + {i_ytilde, i_xtilde}); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + auto block_2_ctile_map = + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_); + + if(GridwiseGemm::CheckValidity( + descs[I0], descs[I1], descs[I2], block_2_ctile_map)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(descs[I2])); + + block_2_ctile_map_container_.push_back(block_2_ctile_map); + } + } + } + } + template ::type = false> + void CreateABCDesc() + { + const index_t ConvStrideD = conv_filter_strides_[0]; + const index_t ConvStrideH = conv_filter_strides_[1]; + const index_t ConvStrideW = conv_filter_strides_[2]; + + const index_t ConvDilationD = conv_filter_dilations_[0]; + const index_t ConvDilationH = conv_filter_dilations_[1]; + const index_t ConvDilationW = conv_filter_dilations_[2]; + + const auto GcdStrideDilationD = math::gcd(ConvStrideD, ConvDilationD); + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto ZTilde = ConvStrideD / GcdStrideDilationD; + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const index_t Z = filter_spatial_lengths_[0]; + const index_t Y = filter_spatial_lengths_[1]; + const index_t X = filter_spatial_lengths_[2]; + for(index_t i_ztilde = 0; 
i_ztilde < ZTilde; ++i_ztilde) + { + for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde) + { + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto ZDotSlice = math::integer_divide_ceil(Z - i_ztilde, ZTilde); + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + if(ZDotSlice * YDotSlice * XDotSlice <= 0) + { + continue; + } + + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + Conv_N_, + Conv_K_, + Conv_C_, + input_spatial_lengths_, + filter_spatial_lengths_, + output_spatial_lengths_, + conv_filter_strides_, + conv_filter_dilations_, + input_left_pads_, + input_right_pads_, + {i_ztilde, i_ytilde, i_xtilde}); + a_grid_desc_k0_m_k1_container_.push_back(descs[I0]); + b_grid_desc_k0_n_k1_container_.push_back(descs[I1]); + c_grid_desc_m_n_container_.push_back(descs[I2]); + + auto block_2_ctile_map = + GridwiseGemm::MakeDefaultBlock2CTileMap(descs[I2], M01_, N01_); + + if(GridwiseGemm::CheckValidity( + descs[I0], descs[I1], descs[I2], block_2_ctile_map)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_.push_back( + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2( + descs[I2])); + + block_2_ctile_map_container_.push_back(block_2_ctile_map); + } + } + } + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + std::vector a_grid_desc_k0_m_k1_container_; + std::vector b_grid_desc_k0_n_k1_container_; + std::vector c_grid_desc_m_n_container_; + std::vector + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_; + std::vector block_2_ctile_map_container_; + index_t M01_; + index_t N01_; + OutElementwiseOperation a_element_op_; + WeiElementwiseOperation b_element_op_; + InElementwiseOperation c_element_op_; + // for checking IsSupportedArgument() + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + + std::vector input_spatial_lengths_; + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + std::vector input_left_pads_; + std::vector input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + float ave_time = 0; + for(size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + { + { + std::cout << "arg.a_grid_desc_k0_m_k1_container_{" + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) << ", " + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2) << "}" + << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_container_{" + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I0) << ", " + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_container_[i].GetLength(I2) << "}" + << std::endl; + + std::cout << "arg.c_grid_desc_m_n_container_{ " + << arg.c_grid_desc_m_n_container_[i].GetLength(I0) << ", " + << arg.c_grid_desc_m_n_container_[i].GetLength(I1) << "}" + << std::endl; + + std::cout << "arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_( " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I0) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I1) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I2) + << ", " + << 
arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I3) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I4) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I5) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I6) + << ", " + << arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i].GetLength(I7) + << " ) " << std::endl; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i], + arg.block_2_ctile_map_container_[i])) + { + throw std::runtime_error( + "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting"); + } + + const index_t grid_size = arg.block_2_ctile_map_container_[i].CalculateGridSize( + arg.c_grid_desc_m_n_container_[i]); + + const auto K = arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I0) * + arg.a_grid_desc_k0_m_k1_container_[i].GetLength(I2); + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>, + OutElementwiseOperation, + WeiElementwiseOperation, + InElementwiseOperation, + remove_reference_t, + true>; + + ave_time += launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i], + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_container_[i]); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t< + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2>, + OutElementwiseOperation, + WeiElementwiseOperation, + InElementwiseOperation, + remove_reference_t, + false>; + + ave_time += launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_container_[i], + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_container_[i]); + } + } + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 pad = 0 conv + for(int i = 0; i < NDimSpatial; i++) + { + if(!(arg.filter_spatial_lengths_[i] == 1 && arg.conv_filter_strides_[i] == 1 && + arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0)) + { + return false; + } + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 1 && + arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % 
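+             // (rationale, as read from the layouts: A is the conv output with K as its
+             //  fastest dimension and B is the KYXC weight with C fastest, so vectorized
+             //  global loads require K and C to be multiples of the scalar-per-vector)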
BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_C_ % CThreadTransferDstScalarPerVector == 0)) + { + return false; + } + + // Gridwise GEMM size + for(std::size_t i = 0; i < arg.a_grid_desc_k0_m_k1_container_.size(); i++) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_container_[i], + arg.b_grid_desc_k0_n_k1_container_[i], + arg.c_grid_desc_m_n_container_[i], + arg.block_2_ctile_map_container_[i])) + { + return false; + } + } + return true; + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(InDataType* p_in_grid, + const WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(void* p_in_grid, + const void* p_wei_grid, + const void* p_out_grid, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceConvNdBwdDataNwcKxcNwk_Xdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0){ + + str<< " Filter1x1Stride1Pad0"; + } + + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp new file mode 100644 index 00000000..8e628800 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise.hpp @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
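+
+// DeviceElementwise applies one ElementwiseOperation across NumInput input tensors and
+// writes NumOutput output tensors, all sharing the same NumDim lengths (per-tensor strides
+// may differ). Each tensor is described by an n-d naive descriptor, merged into a 1-d
+// descriptor, and right-padded so every thread can step through MPerThread elements per
+// loop iteration; the per-tensor vector widths come from InScalarPerVectorSeq and
+// OutScalarPerVectorSeq and are validated in IsSupportedArgument.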
+ +#pragma once + +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise_base.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceElementwise + : public DeviceElementwiseBase +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + static constexpr int NumOutput = OutDataTypeTuple::Size(); + + static_assert(NumInput == InScalarPerVectorSeq::Size() && + NumOutput == OutScalarPerVectorSeq::Size(), + "Tuple size is inconsistent with the number of in/out!"); + + static auto GenerateInDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + template + static auto PadDescriptor_M_1d(Desc_M desc_m, index_t gridSize, index_t blockSize) + { + constexpr auto I0 = Number<0>{}; + + const auto m = desc_m.GetLength(I0); + const index_t loop_step = gridSize * blockSize * MPerThread; + const auto pad = math::integer_least_multiple(m, loop_step) - m; + const auto desc_m_pad = + transform_tensor_descriptor(desc_m, + make_tuple(make_right_pad_transform(m, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return desc_m_pad; + } + + static auto MakeDescriptor_M(const std::array& lengths, + const std::array& stride, + index_t gridSize, + index_t blockSize) + { + auto tupleOfShape = generate_tuple([&](auto I) { return lengths[I]; }, Number{}); + auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); + + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + + // merge nd to 1d desc - [s0 * s1 * ...] 
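+        // Example (numbers are illustrative only): lengths {4, 8} with packed strides {8, 1}
+        // merge into a single dimension of length 32; with gridSize = 2, blockSize = 64 and
+        // MPerThread = 8 the loop step is 2 * 64 * 8 = 1024, so PadDescriptor_M_1d right-pads
+        // 32 up to integer_least_multiple(32, 1024) = 1024 (pad = 992).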
+ if constexpr(NumDim > 1) + { + const auto desc_m = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(tupleOfShape)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), + make_tuple(Sequence<0>{})); + + return PadDescriptor_M_1d(desc_m, gridSize, blockSize); + } + else + return PadDescriptor_M_1d(desc, gridSize, blockSize); + } + + template + static auto GenerateInOutGrid1dDescTuple(Number) + { + return generate_tuple( + [&](auto) { + if constexpr(NumDim > 1) + { + return MakeDescriptor_M({1, 1}, {1, 1}, 1, 1); + } + else + { + return MakeDescriptor_M({1}, {1}, 1, 1); + }; + }, + Number{}); + }; + + using InGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number{})); + using OutGrid1dDescTuple = decltype(GenerateInOutGrid1dDescTuple(Number{})); + + using GridwiseElementwise = GridwiseElementwise_1D; + + struct Argument : public BaseArgument + { + Argument(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) + + : lengths_(lengths), + inStridesArray_(inStridesArray), + outStridesArray_(outStridesArray), + elementwise_op_(elementwise_op), + blockSize_(256), + gridSize_(120) // FIXME - Calculate the grid size by number of CU in the future + { + in_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(in_dev_buffers[I.value]); + }, + Number{}); + + out_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(out_dev_buffers[I.value]); + }, + Number{}); + + in_grid_1d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeDescriptor_M( + lengths, inStridesArray[I.value], gridSize_, blockSize_); + }, + Number{}); + + out_grid_1d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeDescriptor_M( + lengths, outStridesArray[I.value], gridSize_, blockSize_); + }, + Number{}); + } + + InDataTypePointerTuple in_dev_buffers_; + OutDataTypePointerTuple out_dev_buffers_; + InGrid1dDescTuple in_grid_1d_desc_tuple_; + OutGrid1dDescTuple out_grid_1d_desc_tuple_; + + std::array lengths_; + std::array, NumInput> inStridesArray_; + std::array, NumOutput> outStridesArray_; + + ElementwiseOperation elementwise_op_; + index_t blockSize_; + index_t gridSize_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel = kernel_elementwise_1d; + + float elapsed_time = launch_and_time_kernel(stream_config, + kernel, + dim3(arg.gridSize_), + dim3(arg.blockSize_), + 0, + arg.in_grid_1d_desc_tuple_, + arg.out_grid_1d_desc_tuple_, + arg.in_dev_buffers_, + arg.out_dev_buffers_, + arg.elementwise_op_); + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(arg.lengths_.back() % MPerThread != 0) + return false; + + auto IsScalarPerVectorValid = [&](const std::array& lengths, + const std::array& strides, + index_t scalarPerVector) { + if(strides.back() == 1 && lengths.back() % scalarPerVector == 0) + return true; + + if(strides.back() != 1 && scalarPerVector == 1) + return true; + + return false; + }; + + bool valid = true; + static_for<0, NumInput, 1>{}([&](auto I) { + 
if(!IsScalarPerVectorValid( + arg.lengths_, arg.inStridesArray_[I.value], InScalarPerVectorSeq::At(I))) + valid = false; + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + if(!IsScalarPerVectorValid( + arg.lengths_, arg.outStridesArray_[I.value], OutScalarPerVectorSeq::At(I))) + valid = false; + }); + + return valid; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto + MakeArgument(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) + { + return Argument{lengths, + inStridesArray, + outStridesArray, + in_dev_buffers, + out_dev_buffers, + elementwise_op}; + } + + std::unique_ptr + MakeArgumentPointer(const std::array lengths, + const std::array, NumInput> inStridesArray, + const std::array, NumOutput> outStridesArray, + const std::array in_dev_buffers, + const std::array out_dev_buffers, + ElementwiseOperation elementwise_op) override + { + return std::make_unique(lengths, + inStridesArray, + outStridesArray, + in_dev_buffers, + out_dev_buffers, + elementwise_op); + } + + static auto MakeInvoker() { return Invoker{}; } + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; +}; // namespace device + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp new file mode 100644 index 00000000..8ffc5ef9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/reduction_operator.hpp" + +#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +// X = Elementwise(input1, input2, input3, ...) 
+// Y = Normalization(X, beta, gamma) +namespace ck { +template // Descriptor of inputs, Gamma, Beta +__global__ void kernel_elementwise_layernorm( + const InGrid2dDescTuple in_grid_2d_desc_tuple, // Descriptor tuple of inputs + const GridDesc_M_K x_grid_desc_m_k, // Descriptor of X + const GridDesc_M_K gamma_grid_desc_m_k, // Descriptor of gamma + const GridDesc_M_K beta_grid_desc_m_k, // Descriptor of beta + const GridDesc_M_K y_grid_desc_m_k, // Descriptor of Y + index_t num_k_block_tile_iteration, // + AccDataType epsilon, // Datatype of epsilon + const InDataTypePointerTuple p_in_global_tuple, // Ptr tuple of input matrixs + const GammaDataType* const __restrict__ p_gamma_global, // Ptr of gamma + const BetaDataType* const __restrict__ p_beta_global, // Ptr of beta + YDataType* const __restrict__ p_y_global, // Ptr of y + const XElementwiseOperation x_elementwise_op, // Operation of input + const YElementwiseOperation y_elementwise_op) // Operation of output of normalization +{ + extern __shared__ XDataType p_x_lds[]; + GridwiseElementwiseReduction::Run(in_grid_2d_desc_tuple, // Descriptor tuple of inputs + x_grid_desc_m_k, // Descriptor of X + gamma_grid_desc_m_k, // Descriptor of Gamma + beta_grid_desc_m_k, // Descriptor of Beta + y_grid_desc_m_k, // Descriptor of Y + num_k_block_tile_iteration, // + epsilon, // epsilon + p_in_global_tuple, // Ptr tuple of inputs + p_x_lds, // Ptr of X + p_gamma_global, // Ptr of gamma + p_beta_global, // Ptr of beta + p_y_global, // Ptr of Y + x_elementwise_op, // Operation of input + y_elementwise_op); // Operation of output of normalization +}; +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Y = LayerNorm(A + B, Beta, Gamma) +template // Size to write destination Y +struct DeviceElementwiseNormalizationImpl + : public DeviceElementwiseNormalization +{ + static constexpr int NumInput = InDataTypeTuple::Size(); + + using XDataType = YDataType; + + static_assert( + (KThreadSliceSize % GammaSrcVectorSize == 0), + "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"); + + static_assert( + (KThreadSliceSize % BetaSrcVectorSize == 0), + "Invalid thread slice sizes and/or beta vector sizes configuration, please check!"); + + static constexpr index_t M_BlockTileSize = + MThreadClusterSize * MThreadSliceSize; // num of rows calculated in a block + static constexpr index_t K_BlockTileSize = + KThreadClusterSize * KThreadSliceSize; // num of columns calculated in a block + + static auto GenerateInDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(nullptr); + }, + Number{}); + }; + + using InDataTypePointerTuple = decltype(GenerateInDataTypePointerTuple()); + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + constexpr index_t NumInvariantDim = Rank - NumReduceDim; + static constexpr index_t numSrcDim = Rank; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + 
make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + template + static auto GenerateSrcGrid2dDescTuple(Number) + { + return generate_tuple([&](auto) { return MakeSrc2dDescriptor({1}, {1}, 1, 1); }, + Number{}); + }; + + using InGrid2dDescTuple = decltype(GenerateSrcGrid2dDescTuple(Number{})); + + using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); + + using GridwiseReduceLayernormGeneric = + GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; + + using GridwiseReduceLayernormSweepOnce = + GridwiseElementwiseLayernormWelfordVariance_mk_to_mk; + + struct Argument : public BaseArgument + { + Argument(const std::vector lengths, + const std::array, NumInput> inStridesArray, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + XElementwiseOperation x_elementwise_op, + YElementwiseOperation y_elementwise_op, + AccDataType epsilon, + const std::array in_dev_buffers, + const GammaDataType* p_gamma, + const BetaDataType* p_beta, + YDataType* p_y) + : epsilon_(epsilon), + p_gamma_(p_gamma), + p_beta_(p_beta), + p_y_(p_y), + x_elementwise_op_(x_elementwise_op), + y_elementwise_op_(y_elementwise_op) + { + + Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); + for(int i = 0; i < NumInput; i++) + { + inStridesArray_[i] = + shuffle_tensor_dimensions(inStridesArray[i], reduceDims); + } + + yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + xStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + + gammaStrides_ = shuffle_tensor_dimensions(gammaStrides, reduceDims); + betaStrides_ = shuffle_tensor_dimensions(betaStrides, reduceDims); + + in_dev_buffers_ = generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + return static_cast(in_dev_buffers[I.value]); + }, + Number{}); + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + std::tie(invariant_total_length, 
reduce_total_length) = + get_2d_lengths(Lengths_); + + blkGroupSize_ = 1; + numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + + gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize_; + + in_grid_2d_desc_tuple_ = generate_tuple( + [&](auto I) { + return MakeSrc2dDescriptor( + Lengths_, inStridesArray_[I.value], blkGroupSize_, numBlockTileIteration_); + }, + Number{}); + + x_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, numBlockTileIteration_); + + gamma_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_); + + beta_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_); + + y_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_); + + sweep_once_ = + x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; + + if(!sweep_once_) // if not sweep once, compute memory size for matrix X in lds for + // store Intermediate results + { + int block_TileSize = M_BlockTileSize * reduce_total_length; + x_lds_size_ = block_TileSize * sizeof(XDataType); + } + else + x_lds_size_ = 0; + } + + AccDataType epsilon_; + + InDataTypePointerTuple in_dev_buffers_; + const GammaDataType* p_gamma_; + const BetaDataType* p_beta_; + YDataType* p_y_; + + std::vector Lengths_; + std::array, NumInput> inStridesArray_; + std::vector xStrides_; + std::vector gammaStrides_; + std::vector betaStrides_; + std::vector yStrides_; + + XElementwiseOperation x_elementwise_op_; + YElementwiseOperation y_elementwise_op_; + + int blkGroupSize_; + int numBlockTileIteration_; + size_t gridSize_; + + InGrid2dDescTuple in_grid_2d_desc_tuple_; + GridDesc_M_K x_grid_desc_m_k_; + GridDesc_M_K gamma_grid_desc_m_k_; + GridDesc_M_K beta_grid_desc_m_k_; + GridDesc_M_K y_grid_desc_m_k_; + bool sweep_once_; + int x_lds_size_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel_main = + arg.sweep_once_ ? 
kernel_elementwise_layernorm + : kernel_elementwise_layernorm; + + float avg_time = 0; + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize_), + dim3(BlockSize), + arg.x_lds_size_, + arg.in_grid_2d_desc_tuple_, + arg.x_grid_desc_m_k_, + arg.gamma_grid_desc_m_k_, + arg.beta_grid_desc_m_k_, + arg.y_grid_desc_m_k_, + arg.numBlockTileIteration_, + arg.epsilon_, + arg.in_dev_buffers_, + arg.p_gamma_, + arg.p_beta_, + arg.p_y_, + arg.x_elementwise_op_, + arg.y_elementwise_op_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + + constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + if constexpr(XYSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return false; + } + else + { + for(int i = 0; i < NumInput; i++) + { + if(p_arg_->inStridesArray_[i][NumInvariantDim - 1] != 1) + return false; + } + + if(p_arg_->inStridesArray_[0][NumInvariantDim - 1] != 1 && + p_arg_->inStridesArray_[1][NumInvariantDim - 1] != 1) + return false; + + if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0) + return false; + }; + } + else + { + for(int i = 0; i < NumInput; i++) + { + if(p_arg_->inStridesArray_[i][Rank - 1] != 1) + return false; + } + + if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0) + return false; + }; + + if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0) + { + return false; + } + + auto IsScalarPerVectorValid = [](bool isLastDimensionCoalesced, int scalarPerVector) { + bool ret = true; + + if(!isLastDimensionCoalesced) + ret = scalarPerVector == 1; + else + ret = KThreadSliceSize % scalarPerVector == 0; + + return ret; + }; + + if(!IsScalarPerVectorValid(p_arg_->gammaStrides_.back() == 1, GammaSrcVectorSize)) + return false; + + if(!IsScalarPerVectorValid(p_arg_->betaStrides_.back() == 1, BetaSrcVectorSize)) + return false; + + // if fastest dim is not reduced + if constexpr(XYSrcVectorDim == 0) // + { + if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->gammaStrides_[Rank - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + + // if fastest dim is not reduced + if constexpr(XYSrcVectorDim == 0) + { + if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->betaStrides_[Rank - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0) + return (false); + } + + return true; + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::array, NumInput> inStridesArray, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + AccDataType epsilon, + const std::array in_dev_buffers, + const void* p_gamma, + const void* p_beta, + void* p_y, + XElementwiseOperation x_elementwise_op, + YElementwiseOperation y_elementwise_op) override + { + return std::make_unique(lengths, + inStridesArray, + gammaStrides, + betaStrides, + yStrides, + reduceDims, + x_elementwise_op, + y_elementwise_op, + epsilon, + 
in_dev_buffers, + static_cast(p_gamma), + static_cast(p_beta), + static_cast(p_y)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceElementwiseNormalizationImpl<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "XYSrcVectorDim_" << XYSrcVectorDim << ","; + str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp new file mode 100644 index 00000000..b9a64e8c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp @@ -0,0 +1,875 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. 
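+
+// Data-flow sketch (kept deliberately loose; the precise composition lives in
+// GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1):
+//   acc[m, n] = sum_k a_op(A[m, k]) * b_op(B[k, n])
+//   C[m, n]   = acc[m, n] combined via c_element_op with a per-column bias (its descriptor is
+//               built with stride 0 along M, i.e. a length-N vector broadcast over rows) and
+//               with d0_element_op(D0[m, n]);
+//   each entry of ReduceOperations then reduces C along n into a length-M output
+//   (ReduceGridDesc_M), applying its ReduceInElementwiseOperation before accumulation and
+//   its ReduceAccElementwiseOperation to the accumulated result.
+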
+template +struct DeviceGemmBiasAddReduce_Xdl_CShuffle : public DeviceGemmReduce<1, ReduceOperations::Size()> +{ + using DeviceOp = DeviceGemmBiasAddReduce_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return 
make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // not pad N or K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == 
GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } + } + + // assume D is packed tensor + static auto MakeReduceGridDescriptor_M(index_t MRaw) + { + const auto d_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw)); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto MPad = M - MRaw; + + if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M + return transform_tensor_descriptor(d_grid_desc_mraw, + make_tuple(make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + } + else + { + // not pad M + return d_grid_desc_mraw; + } + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + using C0GridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 0)); + using C1GridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + using ReduceGridDesc_M = decltype(MakeReduceGridDescriptor_M(1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + BiasDataType, + D0DataType, + ReduceAccDataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + D0ElementwiseOperation, + ReduceOperations, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + InMemoryDataOperationEnum::Set, + ReduceGlobalMemoryDataOperation, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + CGridDesc_M_N, + C0GridDesc_M_N, + C1GridDesc_M_N, + ReduceGridDesc_M, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + 
BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + CReduceThreadClusterLengths_MPerBlock_NPerBlock, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + const BiasDataType* p_bias_grid, + const D0DataType* p_d0_grid, + ReducePtrsGlobal p_reduces_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t StrideC1, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + D0ElementwiseOperation d0_element_op, + ReduceInElementwiseOperations reduce_in_element_ops, + ReduceAccElementwiseOperations reduce_out_element_ops) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + p_bias_grid_{p_bias_grid}, + p_d0_grid_{p_d0_grid}, + p_reduces_grid_{p_reduces_grid}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)}, + c0_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, 0)}, + c1_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC1)}, + reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + c0_grid_desc_mblock_mperblock_nblock_nperblock_{}, + c1_grid_desc_mblock_mperblock_nblock_nperblock_{}, + reduce_grid_desc_mblock_mperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + d0_element_op_{d0_element_op}, + reduce_in_element_ops_{reduce_in_element_ops}, + reduce_out_element_ops_{reduce_out_element_ops} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + + c0_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c0_grid_desc_m_n_); + + c1_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c1_grid_desc_m_n_); + + reduce_grid_desc_mblock_mperblock_ = + GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + const BiasDataType* p_bias_grid_; + const D0DataType* p_d0_grid_; + ReducePtrsGlobal p_reduces_grid_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + C0GridDesc_M_N c0_grid_desc_m_n_; + C1GridDesc_M_N c1_grid_desc_m_n_; + ReduceGridDesc_M reduce_grid_desc_m_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename 
GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c0_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c1_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock + reduce_grid_desc_mblock_mperblock_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + D0ElementwiseOperation d0_element_op_; + ReduceInElementwiseOperations reduce_in_element_ops_; + ReduceAccElementwiseOperations reduce_out_element_ops_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float elapsed_time = 0.0f; + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_bias_add_reduce_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + BiasDataType, + D0DataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + D0ElementwiseOperation, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + true>; + + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_bias_grid_, + arg.p_d0_grid_, + arg.p_reduces_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d0_element_op_, + arg.reduce_in_element_ops_, + arg.reduce_out_element_ops_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.c0_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.reduce_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_bias_add_reduce_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + BiasDataType, + D0DataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + D0ElementwiseOperation, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename 
GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + false>; + + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_bias_grid_, + arg.p_d0_grid_, + arg.p_reduces_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.d0_element_op_, + arg.reduce_in_element_ops_, + arg.reduce_out_element_ops_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.c0_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.c1_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.reduce_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); + } + + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static constexpr int NumReduce = ReduceOperations::Size(); + static auto MakeArgument(const void* p_a, + const void* p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_op, + std::array reduce_out_element_op) + { + ReducePtrsGlobal reduce_tuple = generate_tuple( + [&](auto I) { + auto tmp = ReducePtrsGlobal{}[I]; + using T = remove_pointer_t; + return static_cast(p_reduces[I]); + }, + Number{}); + + ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceInElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_in_element_op[I])); + }, + Number{}); + ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceAccElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_out_element_op[I])); + }, + Number{}); + + AElementwiseOperation a_element_op = + *(static_cast(gemm_element_ops[0])); + BElementwiseOperation b_element_op = + *(static_cast(gemm_element_ops[1])); + CElementwiseOperation c_element_op = + *(static_cast(gemm_element_ops[2])); + D0ElementwiseOperation d_element_op = + *(static_cast(d_element_ops[0])); + + return Argument{static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + static_cast(p_bias), + static_cast(p_ds[0]), + reduce_tuple, + M, + N, + K, + StrideA, + StrideB, + StrideC, + StrideDs[0], + a_element_op, + b_element_op, + c_element_op, + d_element_op, + reduce_in_element_ops, + reduce_out_element_ops}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + 
ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_op, + std::array reduce_out_element_op, + index_t /* KBatch */ = 1) override + { + ReducePtrsGlobal reduce_tuple = generate_tuple( + [&](auto I) { + auto tmp = ReducePtrsGlobal{}[I]; + using T = remove_pointer_t; + return static_cast(p_reduces[I]); + }, + Number{}); + + ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceInElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_in_element_op[I])); + }, + Number{}); + ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceAccElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_out_element_op[I])); + }, + Number{}); + + AElementwiseOperation a_element_op = + *(static_cast(gemm_element_ops[0])); + BElementwiseOperation b_element_op = + *(static_cast(gemm_element_ops[1])); + CElementwiseOperation c_element_op = + *(static_cast(gemm_element_ops[2])); + D0ElementwiseOperation d_element_op = + *(static_cast(d_element_ops[0])); + + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + static_cast(p_bias), + static_cast(p_ds[0]), + reduce_tuple, + M, + N, + K, + StrideA, + StrideB, + StrideC, + StrideDs[0], + a_element_op, + b_element_op, + c_element_op, + d_element_op, + reduce_in_element_ops, + reduce_out_element_ops); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmBiasAddReduce_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp new file mode 100644 index 00000000..19140688 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_bias_e_permute_xdl.hpp @@ -0,0 +1,572 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
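+
+// "E permute" refers to how the output E and the bias-like tensor D are addressed: both use
+// a DEGridDesc_M0_M1_M2_N0_N1 descriptor that splits M into M0 * M1 * M2 and N into N0 * N1,
+// each with its own stride, so the result can be written straight into a permuted / strided
+// 5-d layout while the GEMM itself works on the merged [M, N] view built by
+// MakeEGridDescriptor_M_N below.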
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_bias_e_permute.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_bias_e_permute(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatDsPointer p_ds_grid, + FloatE* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_etile_map; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// input : A[M, K], or A[K, N] +// input : B[K, N], or A[N, K] +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) 
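+//
+// Host-side reference of the formula above (illustration only; a Bilinear-style
+// cde_op(c, d) = alpha * c + beta * d is just one possible CDEElementwiseOperation, and in
+// this device op NumDTensor is fixed to 1, so D0 is the single bias-like tensor):
+//
+//   for(m = 0; m < M; ++m)
+//       for(n = 0; n < N; ++n)
+//       {
+//           acc = 0;
+//           for(k = 0; k < K; ++k)
+//               acc += a_op(A[m][k]) * b_op(B[k][n]);
+//           E[m][n] = cde_op(acc, D0[m][n]);
+//       }
+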
+template +struct DeviceGemmBiasEPermute_Xdl : public DeviceGemmBiasCPermute +{ + using DeviceOp = DeviceGemmBiasEPermute_Xdl; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static constexpr index_t NumDTensor = 1; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + static auto MakeEGridDescriptor_M_N(DEGridDesc_M0_M1_M2_N0_N1 d_e_grid_desc) + { + index_t M0 = d_e_grid_desc.M0_; + index_t M1 = d_e_grid_desc.M1_; + index_t M2 = d_e_grid_desc.M2_; + index_t N0 = d_e_grid_desc.N0_; + index_t N1 = d_e_grid_desc.N1_; + + index_t stride_M0 = d_e_grid_desc.stride_M0_; + index_t stride_M1 = d_e_grid_desc.stride_M1_; + index_t stride_M2 = d_e_grid_desc.stride_M2_; + index_t stride_N0 = d_e_grid_desc.stride_N0_; + index_t stride_N1 = d_e_grid_desc.stride_N1_; + + const auto e_grid_desc_mraw_nraw = [&]() { + const auto e_grid_desc_m0_m1_m2_n0_n1 = make_naive_tensor_descriptor( + make_tuple(M0, M1, M2, N0, N1), + make_tuple(stride_M0, stride_M1, stride_M2, stride_N0, stride_N1)); + + return transform_tensor_descriptor( + e_grid_desc_m0_m1_m2_n0_n1, + make_tuple(make_merge_transform(make_tuple(M0, M1, M2)), + make_merge_transform(make_tuple(N0, N1))), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + }(); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(DEGridDesc_M0_M1_M2_N0_N1{})); + + using DsGridDesc_M_N = Tuple; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + ck::Tuple, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + 
BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + + using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + const void* p_d_grid, + void* p_e_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc, + DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_grid_desc)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + + if(MRaw != d_grid_desc.M0_ * d_grid_desc.M1_ * d_grid_desc.M2_) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + if(NRaw != d_grid_desc.N0_ * d_grid_desc.N1_) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + // populate pointer, desc for Ds + // D pointer + p_ds_grid_(I0) = static_cast(p_d_grid); + + // D desc + ds_grid_desc_m_n_(I0) = DeviceOp::MakeEGridDescriptor_M_N(d_grid_desc); + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_(I0) = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_[I0]); + } + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_gemm_bias_e_permute< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2ETileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + const void* p_d, + void* p_e, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc, + DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_d, + p_e, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + d_grid_desc, + e_grid_desc, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_d, + void* p_e, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc, + DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_d, + p_e, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + d_grid_desc, + e_grid_desc, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override 
+ { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmBiasEPermute_Xdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp new file mode 100644 index 00000000..7dc542ab --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp @@ -0,0 +1,595 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template < + typename ADataType, + typename BDataType, + typename CDataType, + typename AccDataType, + typename ALayout, + typename BLayout, + typename CLayout, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + GemmSpecialization GemmSpec, + index_t BlockSize, + index_t MPerBlock, + index_t NPerBlock, + index_t K0PerBlock, + index_t K1, + index_t M1PerThread, + index_t N1PerThread, + index_t KPerThread, + typename M1N1ThreadClusterM1Xs, + typename M1N1ThreadClusterN1Xs, + typename ABlockTransferThreadSliceLengths_K0_M0_M1_K1, + typename ABlockTransferThreadClusterLengths_K0_M0_M1_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + typename ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, + typename ABlockTransferSrcVectorTensorContiguousDimOrder, + typename ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, + typename BBlockTransferThreadSliceLengths_K0_N0_N1_K1, + typename BBlockTransferThreadClusterLengths_K0_N0_N1_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + typename BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, + typename BBlockTransferSrcVectorTensorContiguousDimOrder, + typename BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, + typename CThreadTransferSrcDstAccessOrder, + index_t CThreadTransferSrcDstVectorDim, + index_t CThreadTransferDstScalarPerVector, + enable_if_t< + is_same_v && + is_same_v && + is_same_v, + bool> = false> +struct DeviceGemmDl : public DeviceGemm + +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto K1Number = Number{}; + + static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else 
if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + { + const auto c_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + } + + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = + GridwiseGemmDl_km_kn_mn_v1r3; + + using AGridDesc_K0_M0_M1_K1 = + decltype(GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{})); + using BGridDesc_K0_N0_N1_K1 = + decltype(GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{})); + 
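+    // [Editorial sketch; not part of the upstream patch] Every CK device op in these
+    // files follows the same host-side pattern: build an Argument, check support, then
+    // run it through an Invoker. A minimal, hypothetical use of DeviceGemmDl with
+    // device buffers p_a/p_b/p_c already allocated and the usual
+    // ck::tensor_operation::element_wise::PassThrough ops (instance parameters and
+    // names are illustrative, not a tuned configuration):
+    //
+    //   auto gemm     = DeviceGemmDlInstance{}; // some concrete DeviceGemmDl<...>
+    //   auto argument = gemm.MakeArgument(p_a, p_b, p_c, M, N, K,
+    //                                     StrideA, StrideB, StrideC,
+    //                                     PassThrough{}, PassThrough{}, PassThrough{});
+    //   if(!gemm.IsSupportedArgument(argument))
+    //       throw std::runtime_error("unsupported problem for this instance");
+    //   auto invoker = gemm.MakeInvoker();
+    //   float ms     = invoker.Run(argument, StreamConfig{nullptr, true}); // time kernel
+    //
+    // Padding example for the descriptors above: with MPerBlock = 128 and M = 1000,
+    // PadM = (128 - 1000 % 128) % 128 = 24, so the padded M becomes 1024 when
+    // GemmSpec == GemmSpecialization::MNPadding.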
using DefaultBlock2CTileMap = + decltype(GridwiseGemm::MakeDefaultBlock2CTileMap(CGridDesc_M_N{})); + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_k0_m0_m1_k1_{}, + b_grid_desc_k0_n0_n1_k1_{}, + c_grid_desc_m0_m10_m11_n0_n10_n11_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + a_grid_desc_k0_m_k1_ = DeviceGemmDl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA); + b_grid_desc_k0_n_k1_ = DeviceGemmDl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB); + c_grid_desc_m_n_ = DeviceGemmDl::MakeCGridDescriptor_M_N(M, N, StrideC); + + if(GridwiseGemm::CheckValidity( + a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_)) + { + a_grid_desc_k0_m0_m1_k1_ = + GridwiseGemm::MakeAGridDescriptor_K0_M0_M1_K1(a_grid_desc_k0_m_k1_); + b_grid_desc_k0_n0_n1_k1_ = + GridwiseGemm::MakeBGridDescriptor_K0_N0_N1_K1(b_grid_desc_k0_n_k1_); + c_grid_desc_m0_m10_m11_n0_n10_n11_ = + GridwiseGemm::MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(c_grid_desc_m_n_); + + block_2_ctile_map_ = GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + + AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1_; + BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1_; + CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11_; + + DefaultBlock2CTileMap block_2_ctile_map_; + + // TODO: unused, but may be useful in future. + index_t M01_; + index_t N01_; + + // TODO: unused since gridwise_gemm_dl_v1r3 does NOT support prologue for the time being. + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceGemmDl::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + { + std::cout << "arg.a_grid_desc_k0_m0_m1_k1_{" + << arg.a_grid_desc_k0_m_k1_.GetLength(I0) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n0_n1_k1_{" + << arg.b_grid_desc_k0_n_k1_.GetLength(I0) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } + + if(!GridwiseGemm::CheckValidity( + arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_k0mk1_k0nk1_mn_xdl_v2r3 has invalid setting"); + } + + const index_t grid_size = GridwiseGemm::CalculateGridSize( + arg.c_grid_desc_m_n_.GetLength(I0), arg.c_grid_desc_m_n_.GetLength(I1)); + + const auto K0 = arg.a_grid_desc_k0_m0_m1_k1_.GetLength(I0); + const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K0); + const bool has_double_tail_k_block_loop = + GridwiseGemm::CalculateHasDoubleTailKBlockLoop(K0); + + float ave_time = 0; + + if(has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_gemm_dl_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + true>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m0_m1_k1_, + arg.b_grid_desc_k0_n0_n1_k1_, + arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.block_2_ctile_map_); + } + else if(has_main_k_block_loop && !has_double_tail_k_block_loop) + { + const auto kernel = + kernel_gemm_dl_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + true, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m0_m1_k1_, + arg.b_grid_desc_k0_n0_n1_k1_, + arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.block_2_ctile_map_); + } + else if(!has_main_k_block_loop && has_double_tail_k_block_loop) + { + const auto kernel = + kernel_gemm_dl_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + true>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m0_m1_k1_, + arg.b_grid_desc_k0_n0_n1_k1_, + arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = + kernel_gemm_dl_v1r3, + remove_reference_t, + remove_reference_t, + remove_reference_t, + false, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m0_m1_k1_, + arg.b_grid_desc_k0_n0_n1_k1_, + arg.c_grid_desc_m0_m10_m11_n0_n10_n11_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx906" || ck::get_device_name() == "gfx1030") + { + return GridwiseGemm::CheckValidity( + arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_m_n_); + } + else + { + return false; + } + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + 
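+                        // the two 1s above fill the M01/N01 parameters, which the
+                        // Argument stores as M01_/N01_ but the DL path currently
+                        // leaves unused (see the TODO in Argument above)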
a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmDl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << M1PerThread << ", " + << N1PerThread << ", " + << KPerThread + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp new file mode 100644 index 00000000..f1fb4ab4 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -0,0 +1,682 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_multiple_r.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_multiple_d_multiple_r_xdl_cshuffle( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatDsPointer p_ds_grid, + FloatE* __restrict__ p_e_grid, + FloatRsPointer p_rs_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const QsElementwiseOperation qs_element_op, + const RsElementwiseOperation rs_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + 
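+    // Compiled for the host pass and for the gfx908/gfx90a device passes guarded above;
+    // other device targets fall through to the "ignore" stubs below. The LDS buffer is
+    // sized by the gridwise GEMM, and the single Run call performs the fused work for
+    // this block's tile: C = a_op(A) * b_op(B), E = cde_op(C, D0, D1, ...), plus the
+    // reductions along N that produce the R0[M], R1[M], ... outputs, with qs_element_op
+    // applied before and rs_element_op after each reduction (see the operator
+    // description further below in this file).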
GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_rs_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + rs_grid_desc_mblock_mperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = p_rs_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = qs_element_op; + ignore = rs_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = rs_grid_desc_mblock_mperblock; + ignore = block_2_etile_map; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// GEMM: +// input : A[AK0, M, AK1] +// input : B[AK0, N, AK1] +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// output : R0[M], R1[M], ... +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Q0 = reduce0(q_op0(E)), Q1 = reduce1(q_op0(E)), ... +// R0 = r_op0(Q0), R1 = r_op1(Q1), ... +// Assume: +// D0, D1, ... and E have the same layout +template +struct DeviceGemmMultipleDMultipleR_Xdl_CShuffle + : public DeviceGemmMultipleDMultipleR +{ + using DeviceOp = DeviceGemmMultipleDMultipleR_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + static constexpr index_t NumRTensor = RsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE) + { + const auto e_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideE, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideE)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + // assume D is packed tensor + static auto MakeRGridDescriptor_M(index_t MRaw) + { + const auto r_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw)); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * 
MPerBlock; + const auto MPad = M - MRaw; + + if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M + return transform_tensor_descriptor(r_grid_desc_mraw, + make_tuple(make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + } + else + { + // not pad M + return r_grid_desc_mraw; + } + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); + using RGridDesc_M = decltype(MakeRGridDescriptor_M(1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + DsDataType, + EDataType, + ReduceAccDataType, + RsDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + QsElementwiseOperation, + RsElementwiseOperation, + ThreadReduceOperations, + InMemoryDataOperationEnum::Set, + RsGlobalMemoryDataOperation, + AGridDesc_M_K, + BGridDesc_N_K, + EGridDesc_M_N, + RGridDesc_M, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, + CDEReduceThreadTransferScalarPerVector_NPerBlock, + RThreadTransferDstScalarPerVector_MPerBlock, + LoopSched>; + + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + + using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + std::array p_rs_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + QsElementwiseOperation qs_element_op, + RsElementwiseOperation rs_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, // FIXME + p_e_grid_{static_cast(p_e_grid)}, + p_rs_grid_{}, // FIXME + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideE)}, + r_grid_desc_m_{DeviceOp::MakeRGridDescriptor_M(MRaw)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, 
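+              // the block-tiled Ds/E/Rs descriptors are default-constructed here and
+              // populated in the constructor body once GridwiseGemm::CheckValidity passes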
+ e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + rs_grid_desc_mblock_mperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + qs_element_op_{qs_element_op}, + rs_element_op_{rs_element_op} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + e_grid_desc_m_n_, + r_grid_desc_m_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + const auto d_grid_desc_m_n = + DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideDs[i]); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_(i) = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + d_grid_desc_m_n); + }); + + static_for<0, NumRTensor, 1>{}([&](auto i) { + using RDataType = remove_cvref_t>; + + p_rs_grid_(i) = static_cast(p_rs_grid[i]); + + rs_grid_desc_mblock_mperblock_(i) = + GridwiseGemm::MakeRGridDescriptor_MBlock_MPerBlock(r_grid_desc_m_); + }); + } + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + typename GridwiseGemm::RsGridPointer p_rs_grid_; + + // tensor descriptors + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + EGridDesc_M_N e_grid_desc_m_n_; + RGridDesc_M r_grid_desc_m_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + StaticallyIndexedArray< + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + NumDTensor> + ds_grid_desc_mblock_mperblock_nblock_nperblock_; // FIXME: Ds desc may be of different + // type from E + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_; + + StaticallyIndexedArray + rs_grid_desc_mblock_mperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + QsElementwiseOperation qs_element_op_; + RsElementwiseOperation rs_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.e_grid_desc_m_n_, + arg.r_grid_desc_m_, + arg.block_2_etile_map_)) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_gemm_multiple_d_multiple_r_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + typename GridwiseGemm::RsGridPointer, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + QsElementwiseOperation, + RsElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + ck::StaticallyIndexedArray< + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + NumDTensor>, + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + ck::StaticallyIndexedArray< + typename GridwiseGemm::RGridDescriptor_MBlock_MPerBlock, + NumRTensor>, + typename GridwiseGemm::DefaultBlock2ETileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.p_rs_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.qs_element_op_, + arg.rs_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.rs_grid_desc_mblock_mperblock_, + arg.block_2_etile_map_); + }; + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.e_grid_desc_m_n_, + arg.r_grid_desc_m_, + arg.block_2_etile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + std::array p_rs, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + QsElementwiseOperation qs_element_op, + RsElementwiseOperation rs_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + p_rs, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideDs, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + std::array p_rs, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + 
AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op, + QsElementwiseOperation qs_element_op, + RsElementwiseOperation rs_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + p_rs, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideDs, + StrideE, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmMultipleDMultipleR_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << getGemmSpecializationString(GemmSpec) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..3830e1c0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,698 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_multiple_d_xdl_cshuffle(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap block_2_etile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = a_element_op; + ignore = 
b_element_op; + ignore = cde_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_etile_map; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// GEMM: +// input : A[M, K] +// input : B[N, K] +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// D0, D1, ... and E have the same layout +template +struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD +{ + using DeviceOp = DeviceGemmMultipleD_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + template + static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE) + { + const auto e_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideE, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideE)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + static auto MakeDsGridDescriptor_M_N(const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(MRaws[i], NRaws[i], DsStride[i]); + }, + Number{}); + } + + // desc for problem definition + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + 
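+        // (descriptive note) the A/B block-transfer and C-shuffle epilogue parameters
+        // below are forwarded to the gridwise GEMM unchanged; LoopSched and PipelineVer
+        // at the end select the loop scheduler and software-pipeline version reported
+        // by GetTypeString()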
ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched, + PipelineVer>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + std::array p_ds_grid, + void* p_e_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e_grid)}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(MRaw, KRaw, StrideA)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(KRaw, NRaw, StrideB)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideE)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + MRaw_{MRaw}, + NRaw_{NRaw}, + KRaw_{KRaw} + { + // populate pointer, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds_grid[i]); + + // D desc + ds_grid_desc_m_n_(i) = + DeviceOp::MakeEGridDescriptor_M_N(MRaw, NRaw, StrideDs[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename 
GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // for checking vector load/store + index_t MRaw_; + index_t NRaw_; + index_t KRaw_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_gemm_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::Block2ETileMap, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + // check vector load/store + { + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + // check vector load of A + if constexpr(is_same_v && ABlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && ABlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.MRaw_ % ABlockTransferSrcScalarPerVector != 0) + { + return 
false; + } + } + else + { + return false; + } + + // check vector laod of B + if constexpr(is_same_v && BBlockTransferSrcVectorDim == 2) + { + if(arg.KRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else if constexpr(is_same_v && BBlockTransferSrcVectorDim == 1) + { + // FIXME: not rigorous + if(arg.NRaw_ % BBlockTransferSrcScalarPerVector != 0) + { + return false; + } + } + else + { + return false; + } + + // check vector load of Ds + // only support RowMajor for now + bool all_valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + + if constexpr(!is_same_v) + { + all_valid = false; + } + }); + + if(!all_valid) + { + return false; + } + + // check vector store of E + // only support RowMajor for now + if constexpr(is_same_v) + { + if(arg.NRaw_ % CDEBlockTransferScalarPerVector_NPerBlock != 0) + { + return false; + } + } + else + { + return false; + } + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideDs, + StrideE, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + std::array p_ds, + void* p_e, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + std::array StrideDs, + index_t StrideE, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideDs, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceGemmMultipleD_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << getGemmSpecializationString(GemmSpec) + << ">" + << " LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp new file mode 100644 index 00000000..cf190839 --- 
/dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp @@ -0,0 +1,835 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. +template +struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<0, ReduceOperations::Size()> +{ + using DeviceOp = DeviceGemmReduce_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + 
make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return 
b_grid_desc_bk0_n_bk1; + } + else + { + // not pad N or K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } + } + + // assume Reduce is packed tensor + static auto MakeReduceGridDescriptor_M(index_t MRaw) + { + const auto d_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(MRaw)); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto MPad = M - MRaw; + + if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M + return transform_tensor_descriptor(d_grid_desc_mraw, + make_tuple(make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + } + else + { + // not pad M + return d_grid_desc_mraw; + } + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + using ReduceGridDesc_M = decltype(MakeReduceGridDescriptor_M(1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + ReduceAccDataType, + ReducePtrsGlobal, + AElementwiseOperation, + 
BElementwiseOperation, + CElementwiseOperation, + ReduceOperations, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + InMemoryDataOperationEnum::Set, + ReduceGlobalMemoryDataOperation, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + CGridDesc_M_N, + ReduceGridDesc_M, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + CReduceThreadClusterLengths_MPerBlock_NPerBlock, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + LoopSched>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + ReducePtrsGlobal p_reduces_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + ReduceInElementwiseOperations reduce_in_element_ops, + ReduceAccElementwiseOperations reduce_out_element_ops) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + p_reduces_grid_{p_reduces_grid}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)}, + reduce_grid_desc_m_{DeviceOp::MakeReduceGridDescriptor_M(MRaw)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + reduce_grid_desc_mblock_mperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + reduce_in_element_ops_{reduce_in_element_ops}, + reduce_out_element_ops_{reduce_out_element_ops} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + + reduce_grid_desc_mblock_mperblock_ = + GridwiseGemm::MakeReduceGridDescriptor_MBlock_MPerBlock(reduce_grid_desc_m_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + ReducePtrsGlobal p_reduces_grid_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + ReduceGridDesc_M reduce_grid_desc_m_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock + 
reduce_grid_desc_mblock_mperblock_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + ReduceInElementwiseOperations reduce_in_element_ops_; + ReduceAccElementwiseOperations reduce_out_element_ops_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_ak0_m_ak1_{" + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_bk0_n_bk1_{" + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + + std::cout << "arg.reduce_grid_desc_m_{ " << arg.reduce_grid_desc_m_.GetLength(I0) << "}" + << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float elapsed_time = 0.0f; + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + true>; + + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_reduces_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.reduce_in_element_ops_, + arg.reduce_out_element_ops_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.reduce_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_reduce_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + ReducePtrsGlobal, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + ReduceInElementwiseOperations, + ReduceAccElementwiseOperations, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::ReduceGridDescriptor_MBlock_MPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + false>; + + elapsed_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + 
arg.p_reduces_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.reduce_in_element_ops_, + arg.reduce_out_element_ops_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.reduce_grid_desc_mblock_mperblock_, + arg.block_2_ctile_map_); + } + + return elapsed_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static constexpr int NumReduce = ReduceOperations::Size(); + static auto MakeArgument(const void* p_a, + const void* p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_op, + std::array reduce_out_element_op) + { + (void)p_bias; + (void)p_ds; + (void)StrideDs; + (void)d_element_ops; + + ReducePtrsGlobal reduce_tuple = generate_tuple( + [&](auto I) { + auto tmp = ReducePtrsGlobal{}[I]; + using T = remove_pointer_t; + return static_cast(p_reduces[I]); + }, + Number{}); + + ReduceInElementwiseOperations reduce_in_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceInElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_in_element_op[I])); + }, + Number{}); + ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceAccElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_out_element_op[I])); + }, + Number{}); + + AElementwiseOperation a_element_op = + *(static_cast(gemm_element_ops[0])); + BElementwiseOperation b_element_op = + *(static_cast(gemm_element_ops[1])); + CElementwiseOperation c_element_op = + *(static_cast(gemm_element_ops[2])); + + return Argument{static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + reduce_tuple, + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + reduce_in_element_ops, + reduce_out_element_ops}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(const void* p_a, + const void* p_b, + const void* p_bias, + std::array p_ds, + void* p_c, + std::array p_reduces, + ck::index_t M, + ck::index_t N, + ck::index_t K, + ck::index_t StrideA, + ck::index_t StrideB, + ck::index_t StrideC, + std::array StrideDs, + std::array gemm_element_ops, + std::array d_element_ops, + std::array reduce_in_element_op, + std::array reduce_out_element_op, + ck::index_t = 1) override + { + (void)p_bias; + (void)p_ds; + (void)StrideDs; + (void)d_element_ops; + + ReducePtrsGlobal reduce_tuple = generate_tuple( + [&](auto I) { + auto tmp = ReducePtrsGlobal{}[I]; + using T = remove_pointer_t; + return static_cast(p_reduces[I]); + }, + Number{}); + + ReduceInElementwiseOperations 
reduce_in_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceInElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_in_element_op[I])); + }, + Number{}); + ReduceAccElementwiseOperations reduce_out_element_ops = generate_tuple( + [&](auto I) { + auto tmp = ReduceAccElementwiseOperations{}[I]; + using T = remove_pointer_t; + return *(static_cast(reduce_out_element_op[I])); + }, + Number{}); + + AElementwiseOperation a_element_op = + *(static_cast(gemm_element_ops[0])); + BElementwiseOperation b_element_op = + *(static_cast(gemm_element_ops[1])); + CElementwiseOperation c_element_op = + *(static_cast(gemm_element_ops[2])); + + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + reduce_tuple, + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + reduce_in_element_ops, + reduce_out_element_ops); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmReduce_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp new file mode 100644 index 00000000..21bb36b7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp @@ -0,0 +1,570 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
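+
+// Typical host-side usage of the device op defined in this header (a minimal sketch for
+// orientation only; the device buffer pointers, problem sizes and the fully-specialized
+// alias `DeviceOp` below are assumptions, not part of this file):
+//
+//   using DeviceOp = ck::tensor_operation::device::DeviceGemmXdl</* ...type and tuning parameters... */>;
+//
+//   auto argument = DeviceOp::MakeArgument(p_a_dev, p_b_dev, p_c_dev,
+//                                          M, N, K, StrideA, StrideB, StrideC,
+//                                          a_element_op, b_element_op, c_element_op);
+//
+//   if(DeviceOp::IsSupportedArgument(argument))
+//   {
+//       auto invoker  = DeviceOp::MakeInvoker();
+//       float elapsed = invoker.Run(argument, StreamConfig{}); // launches and times the kernel
+//   }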
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemmXdl : public DeviceGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto K1Number = Number{}; + + static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + { + const auto c_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % 
NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + } + + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + NumPrefetch, + LoopSched, + PipelineVer>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + kraw_{K} + { + a_grid_desc_k0_m_k1_ = DeviceGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA); + b_grid_desc_k0_n_k1_ = DeviceGemmXdl::MakeBGridDescriptor_K0_N_K1(K, N, StrideB); + c_grid_desc_m_n_ = DeviceGemmXdl::MakeCGridDescriptor_M_N(M, N, StrideC); + + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01); + + if(GridwiseGemm::CheckValidity(a_grid_desc_k0_m_k1_, + b_grid_desc_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ = + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename 
GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + index_t kraw_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceGemmXdl::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + true>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r3< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(ck::get_device_name() == "gfx908") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v)) + { + return false; + } + } + else if(ck::get_device_name() == "gfx90a") + { + if constexpr(!(is_same_v || is_same_v || 
+ is_same_v || is_same_v)) + { + return false; + } + } + else + { + return false; + } + + if(arg.kraw_ % K1 != 0) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceGemmXdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << MPerXDL << ", " + << NPerXDL << ", " + << MXdlPerWave << ", " + << NXdlPerWave + << ">" + << " NumPrefetch: " + << NumPrefetch << ", " + << "LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer]; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp new file mode 100644 index 00000000..cc8c8d4d --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp @@ -0,0 +1,700 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
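+
+// The MakeAGridDescriptor_AK0_M_AK1 / MakeBGridDescriptor_BK0_N_BK1 / MakeCGridDescriptor_M_N
+// helpers below round each raw GEMM extent up to a multiple of its per-block tile size and,
+// depending on GemmSpec, right-pad by the difference. A small worked example (the numbers are
+// chosen purely for illustration and are not defaults of this file):
+//
+//   MRaw = 1000, MPerBlock = 256          : M = integer_divide_ceil(1000, 256) * 256 = 1024, MPad = 24
+//   KRaw = 4096, KPerBlock = 64, AK1 = 8  : K = 4096, KPad = 0, AK0 = K / AK1 = 512
+//
+// The (possibly padded) K axis is then unmerged into (AK0, AK1) so the A/B block copies can
+// address K as a two-level index, with AK1/BK1 as the inner, vectorizable dimension.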
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. +template +struct DeviceGemm_Xdl_CShuffle : public DeviceGemm +{ + using DeviceOp = DeviceGemm_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, 
AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // not pad N or K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + 
make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + CGridDesc_M_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + LoopSched, + PipelineVer>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* 
p_c_grid, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + kraw_{KRaw} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + index_t kraw_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_ak0_m_ak1_{" + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_bk0_n_bk1_{" + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! 
GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + true>; + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::DefaultBlock2CTileMap, + false>; + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + if((arg.kraw_ % AK1 != 0 || arg.kraw_ % BK1 != 0) && + !(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == GemmSpecialization::KPadding)) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + 
index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + std::map LoopSchedToString{ + {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}}; + + std::map PipelineVersionToString{{PipelineVersion::v1, "v1"}, + {PipelineVersion::v2, "v2"}}; + + // clang-format off + str << "DeviceGemm_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">" + << " LoopScheduler: " + << LoopSchedToString[LoopSched] << ", " + << "PipelineVersion: " + << PipelineVersionToString[PipelineVer];; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp new file mode 100644 index 00000000..875623dc --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_layernorm_cshuffle.hpp @@ -0,0 +1,773 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// The GEMM + Layernorm implementation is a specialized kernel which allows fusing both layers +// together given the condition GEMM extents N of MNK is spanned by a single workgroup. For example, +// a kernel configured with NPerBlock = 128 allows to operate on all GEMM sizes if N <= 128 +// +// Note: inter-wave loop scheduler is rolled out to c-shuffle version first. Becuase non c-shuffle +// version currently has compiler issues with register spill which further causes validation +// failures. 
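+//
+// A scalar reference of the fused epilogue (the D formula below), written out only as an
+// illustrative sketch for host-side validation; the epsilon term and the biased variance
+// estimator used here are assumptions about the layernorm, not guarantees of this kernel:
+//
+//   for each row m (normalization runs over the N dimension, which a single workgroup spans):
+//     acc[n]  = acc_element_op(sum_k(A[m][k] * B[k][n]) + bias[n]) + add[m][n]
+//     mean    = (1 / N) * sum_n(acc[n])
+//     var     = (1 / N) * sum_n((acc[n] - mean)^2)
+//     D[m][n] = ((acc[n] - mean) / sqrt(var + epsilon)) * gamma[n] + beta[n]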
+// +// D = Layernorm(acc_element_op(A * B + broadcast(bias)) + add) * broadcast(gamma) + broadcast(beta) +template +struct DeviceGemmLayerNorm_Xdl_CShuffle : public BaseOperator +{ + using DeviceOp = DeviceGemmLayerNorm_Xdl_CShuffle; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static auto MakeAGridDescriptor_AK0_M_AK1(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both M and K + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad M, but not K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_right_pad_transform(MRaw, MPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad K, but not M + assert(K % AK1 == 0); + + const auto AK0 = K / AK1; + + const auto a_grid_desc_m_k = transform_tensor_descriptor( + a_grid_desc_mraw_kraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + else + { + // not pad M or K + assert(KRaw % AK1 == 0); + + const auto AK0 = KRaw / AK1; + + const auto a_grid_desc_ak0_m_ak1 = + transform_tensor_descriptor(a_grid_desc_mraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return a_grid_desc_ak0_m_ak1; + } + } + + static auto MakeBGridDescriptor_BK0_N_BK1(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto 
b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto K = math::integer_divide_ceil(KRaw, KPerBlock) * KPerBlock; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad both N and K + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding) + { + // pad N, but not K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else if constexpr(GemmSpec == GemmSpecialization::KPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad K, but not N + assert(K % BK1 == 0); + + const auto BK0 = K / BK1; + + const auto b_grid_desc_n_k = transform_tensor_descriptor( + b_grid_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + else + { + // not pad N or K + assert(KRaw % BK1 == 0); + + const auto BK0 = KRaw / BK1; + + const auto b_grid_desc_bk0_n_bk1 = + transform_tensor_descriptor(b_grid_desc_nraw_kraw, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + + return b_grid_desc_bk0_n_bk1; + } + } + + static auto MakeCGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideC) + { + const auto c_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideC)); + } + }(); + + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + + const auto MPad = M - 
MRaw; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M and N + return transform_tensor_descriptor(c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MKPadding) + { + // pad M, but not N + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::NKPadding) + { + // pad N, but not M + return transform_tensor_descriptor( + c_grid_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_grid_desc_mraw_nraw; + } + } + + static auto MakeGridDescriptor_N(index_t NRaw) + { + const auto grid_desc_nraw = make_naive_tensor_descriptor_packed(make_tuple(NRaw)); + + const auto N = math::integer_divide_ceil(NRaw, NPerBlock) * NPerBlock; + const auto NPad = N - NRaw; + + if constexpr(GemmSpec == GemmSpecialization::NPadding || + GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::NKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad N + return transform_tensor_descriptor(grid_desc_nraw, + make_tuple(make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + } + else + { + // not pad N + return grid_desc_nraw; + } + } + + using AGridDesc_AK0_M_AK1 = decltype(MakeAGridDescriptor_AK0_M_AK1(1, 1, 1)); + using BGridDesc_BK0_N_BK1 = decltype(MakeBGridDescriptor_BK0_N_BK1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + using C0GridDesc_N = decltype(MakeGridDescriptor_N(1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + GemmAccDataType, + CShuffleDataType, + CDataType, + C0DataType, + ReduceAccDataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + CElementwiseOperation, + InMemoryDataOperationEnum::Set, + AGridDesc_AK0_M_AK1, + BGridDesc_BK0_N_BK1, + CGridDesc_M_N, + C0GridDesc_N, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CShuffleBlockTransferScalarPerVector_NPerBlock, + 
CReduceThreadClusterLengths_MPerBlock_NPerBlock, + CReduceThreadCopySrcDstScalarPerVector_NPerBlock, + LoopSched>; + + using Block2CTileMap = typename GridwiseGemm::DefaultBlock2CTileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + const C0DataType* p_c0_grid_add, + const C0DataType* p_c0_grid_bias, + const C0DataType* p_c0_grid_gamma, + const C0DataType* p_c0_grid_beta, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + p_c0_grid_bias_{p_c0_grid_bias}, + p_c0_grid_add_{p_c0_grid_add}, + p_c0_grid_gamma_{p_c0_grid_gamma}, + p_c0_grid_beta_{p_c0_grid_beta}, + a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)}, + b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)}, + c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)}, + c0_grid_desc_n_{MakeGridDescriptor_N(NRaw)}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + c0_grid_desc_nblock_nperblock_{}, + block_2_ctile_map_{Block2CTileMap(c_grid_desc_m_n_)}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + c_element_op_{c_element_op} + { + if(GridwiseGemm::CheckValidity(a_grid_desc_ak0_m_ak1_, + b_grid_desc_bk0_n_bk1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + c_grid_desc_m_n_); + + c0_grid_desc_nblock_nperblock_ = + GridwiseGemm::MakeC0GridDescriptor_NBlock_NPerBlock(c0_grid_desc_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + const C0DataType* p_c0_grid_bias_; + const C0DataType* p_c0_grid_add_; + const C0DataType* p_c0_grid_gamma_; + const C0DataType* p_c0_grid_beta_; + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + CGridDesc_M_N c_grid_desc_m_n_; + C0GridDesc_N c0_grid_desc_n_; + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock_; + typename GridwiseGemm::C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock_; + Block2CTileMap block_2_ctile_map_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + AccElementwiseOperation acc_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { +#if 0 + { + std::cout << "arg.a_grid_desc_ak0_m_ak1_{" + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I1) << ", " + << arg.a_grid_desc_ak0_m_ak1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_bk0_n_bk1_{" + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I0) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I1) << ", " + << arg.b_grid_desc_bk0_n_bk1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } +#endif + + 
if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error("wrong! GridwiseGemm has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + float ave_time = 0; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + const auto kernel = kernel_gemm_layernorm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + C0DataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::C0GridDescriptor_NBlock_NPerBlock, + Block2CTileMap, + true>; + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_c0_grid_bias_, + arg.p_c0_grid_add_, + arg.p_c0_grid_gamma_, + arg.p_c0_grid_beta_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.c0_grid_desc_nblock_nperblock_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_layernorm_xdl_cshuffle_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + C0DataType, + AElementwiseOperation, + BElementwiseOperation, + AccElementwiseOperation, + CElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + typename GridwiseGemm::C0GridDescriptor_NBlock_NPerBlock, + Block2CTileMap, + false>; + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.p_c0_grid_bias_, + arg.p_c0_grid_add_, + arg.p_c0_grid_gamma_, + arg.p_c0_grid_beta_, + arg.a_element_op_, + arg.b_element_op_, + arg.acc_element_op_, + arg.c_element_op_, + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.c0_grid_desc_nblock_nperblock_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + return GridwiseGemm::CheckValidity(arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + const C0DataType* p_c0_bias, + const C0DataType* p_c0_add, + const C0DataType* p_c0_gamma, + const C0DataType* p_c0_beta, + index_t MRaw, + index_t NRaw, + 
index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + p_c0_bias, + p_c0_add, + p_c0_gamma, + p_c0_beta, + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + acc_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + const void* p_c0_bias, + const void* p_c0_add, + const void* p_c0_gamma, + const void* p_c0_beta, + index_t MRaw, + index_t NRaw, + index_t KRaw, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + CElementwiseOperation c_element_op, + index_t /* KBatch */ = 1) + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + static_cast(p_c0_bias), + static_cast(p_c0_add), + static_cast(p_c0_gamma), + static_cast(p_c0_beta), + MRaw, + NRaw, + KRaw, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + acc_element_op, + c_element_op); + } + + std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmLayerNorm_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp new file mode 100644 index 00000000..42cabcea --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_skip_b_lds.hpp @@ -0,0 +1,523 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
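For reference, the comment at the top of DeviceGemmLayerNorm_Xdl_CShuffle above states that the fused epilogue computes D = Layernorm(acc_element_op(A * B + broadcast(bias)) + add) * broadcast(gamma) + broadcast(beta), with the normalization taken over the N dimension of each output row. The host-side sketch below restates that math in plain C++ so the kernel output can be checked against it; it is an illustrative reference under stated assumptions (row-major float tensors, identity acc_element_op, fixed epsilon), not code from this patch.

#include <cmath>
#include <cstddef>
#include <vector>

// Host reference for: D = Layernorm(acc_op(A * B + bias) + add) * gamma + beta,
// normalized over N per output row. Assumes row-major float tensors and an
// identity acc_op; purely for validation, not a device implementation.
inline std::vector<float> reference_gemm_bias_add_layernorm(const std::vector<float>& a,     // M x K
                                                            const std::vector<float>& b,     // K x N
                                                            const std::vector<float>& bias,  // N
                                                            const std::vector<float>& add,   // M x N
                                                            const std::vector<float>& gamma, // N
                                                            const std::vector<float>& beta,  // N
                                                            int M, int N, int K, float eps = 1e-5f)
{
    std::vector<float> d(static_cast<std::size_t>(M) * N);
    std::vector<float> row(N);
    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(int k = 0; k < K; ++k)
                acc += a[m * K + k] * b[k * N + n];
            row[n] = acc + bias[n] + add[m * N + n]; // acc_op assumed to be identity
        }
        float mean = 0.f, mean_sq = 0.f;
        for(int n = 0; n < N; ++n)
        {
            mean += row[n];
            mean_sq += row[n] * row[n];
        }
        mean /= N;
        mean_sq /= N;
        const float inv_std = 1.f / std::sqrt(mean_sq - mean * mean + eps);
        for(int n = 0; n < N; ++n)
            d[m * N + n] = (row[n] - mean) * inv_std * gamma[n] + beta[n];
    }
    return d;
}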
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp" + +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemmXdlSkipBLds : public DeviceGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto K1Number = Number{}; + static_assert(BBlockBufferSize >= 2); + + static auto MakeAGridDescriptor_K0_M_K1(index_t M, index_t K, index_t StrideA) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeBGridDescriptor_K0_N_K1(index_t K, index_t N, index_t StrideB) + { + assert(K % K1 == 0); + + const index_t K0 = K / K1; + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + else + { + return transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_unmerge_transform(make_tuple(K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + } + + static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + { + const auto c_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % 
MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + } + + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_K0_M_K1(1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_K0_N_K1(1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferSrcScalarPerVector, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockBufferSize, + Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector>; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_k0_m_k1_{}, + b_grid_desc_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + a_grid_desc_k0_m_k1_ = + DeviceGemmXdlSkipBLds::MakeAGridDescriptor_K0_M_K1(M, K, StrideA); + b_grid_desc_k0_n_k1_ = + DeviceGemmXdlSkipBLds::MakeBGridDescriptor_K0_N_K1(K, N, StrideB); + c_grid_desc_m_n_ = DeviceGemmXdlSkipBLds::MakeCGridDescriptor_M_N(M, N, StrideC); + + if(GridwiseGemm::CheckValidity( + a_grid_desc_k0_m_k1_, b_grid_desc_k0_n_k1_, c_grid_desc_m_n_, M01_, N01_)) + { + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_ = + GridwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n_); + + block_2_ctile_map_ = + GridwiseGemm::MakeDefaultBlock2CTileMap(c_grid_desc_m_n_, M01, N01); + + b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_ = + GridwiseGemm::MakeBGridDescriptor_K0_K1_K2_N0_N1_N2_N3_K3(b_grid_desc_k0_n_k1_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + typename GridwiseGemm::BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 + 
b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_; + typename GridwiseGemm::CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_; + typename GridwiseGemm::DefaultBlock2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceGemmXdlSkipBLds::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + { + std::cout << "arg.a_grid_desc_k0_m_k1_{" << arg.a_grid_desc_k0_m_k1_.GetLength(I0) + << ", " << arg.a_grid_desc_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_k0_m_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_k0_n_k1_{" << arg.b_grid_desc_k0_n_k1_.GetLength(I0) + << ", " << arg.b_grid_desc_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_k0_n_k1_.GetLength(I2) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.M01_, + arg.N01_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting"); + } + + const index_t grid_size = GridwiseGemm::CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K0 = arg.a_grid_desc_k0_m_k1_.GetLength(I0); + + const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0); + + float ave_time = 0; + + if(has_main_k0_block_loop) + { + const auto kernel = kernel_gemm_xdlops_skip_b_lds_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + true>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + } + else + { + const auto kernel = kernel_gemm_xdlops_skip_b_lds_v1< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + false>; + + ave_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3_, + arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_k0_m_k1_, + arg.b_grid_desc_k0_n_k1_, 
+ arg.c_grid_desc_m_n_, + arg.M01_, + arg.N01_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmXdlSkipBLds" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << K1 << ", " + << MPerXDL << ", " + << NPerXDL << ", " + << MXdlPerWave << ", " + << NXdlPerWave + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp new file mode 100644 index 00000000..50515189 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp @@ -0,0 +1,650 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
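The next header, device_gemm_xdl_splitk_c_shuffle.hpp, adds a split-K GEMM: the K dimension is divided across KBatch groups of workgroups, the output buffer is zeroed with hipMemset, and for KBatch > 1 the partial products are combined through an AtomicAdd variant of the gridwise GEMM. The sketch below restates that decomposition on the host; the function name and plain-float types are illustrative assumptions, not part of the patch.

#include <algorithm>
#include <vector>

// Host illustration of the split-K decomposition: K is cut into KBatch chunks,
// each chunk produces a partial GEMM, and the partials are accumulated into the
// same C buffer (the device op zeroes C and uses AtomicAdd when KBatch > 1).
// Assumes row-major float tensors and C pre-filled with zeros.
inline void reference_splitk_gemm(const std::vector<float>& a, // M x K
                                  const std::vector<float>& b, // K x N
                                  std::vector<float>& c,       // M x N, zero-initialized
                                  int M, int N, int K, int KBatch)
{
    const int k_per_batch = (K + KBatch - 1) / KBatch;
    for(int kb = 0; kb < KBatch; ++kb) // each chunk maps to its own set of workgroups
    {
        const int k_begin = kb * k_per_batch;
        const int k_end   = std::min(K, k_begin + k_per_batch);
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
            {
                float partial = 0.f;
                for(int k = k_begin; k < k_end; ++k)
                    partial += a[m * K + k] * b[k * N + n];
                c[m * N + n] += partial; // done with atomics on the GPU when KBatch > 1
            }
    }
}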
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceGemmXdlSplitKCShuffle : public DeviceGemmSplitK +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto K1Number = Number{}; + + static auto + MakeAGridDescriptor_KBatch_K0_M_K1(index_t M, index_t K, index_t StrideA, int KBatch, int KPad) + { + assert(KPad % (K1 * KBatch) == 0); + + const index_t K0 = KPad / (K1 * KBatch); + + const auto a_grid_desc_m_k = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(StrideA, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, K), make_tuple(I1, StrideA)); + } + }(); + + const auto a_grid_desc_m_kpad = transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + return transform_tensor_descriptor( + a_grid_desc_m_kpad, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + make_right_pad_transform(M, PadM)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + else + { + return transform_tensor_descriptor( + a_grid_desc_m_kpad, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + } + + static auto + MakeBGridDescriptor_KBatch_K0_N_K1(index_t K, index_t N, index_t StrideB, int KBatch, int KPad) + { + assert(KPad % (K1 * KBatch) == 0); + + const index_t K0 = KPad / (K1 * KBatch); + + const auto b_grid_desc_k_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(StrideB, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(K, N), make_tuple(I1, StrideB)); + } + }(); + + const auto b_grid_desc_kpad_n = transform_tensor_descriptor( + b_grid_desc_k_n, + make_tuple(make_right_pad_transform(K, KPad - K), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + return transform_tensor_descriptor( + b_grid_desc_kpad_n, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + else + { + return transform_tensor_descriptor( + 
b_grid_desc_kpad_n, + make_tuple(make_unmerge_transform(make_tuple(KBatch, K0, K1Number)), + make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + } + + static auto MakeCGridDescriptor_M_N(index_t M, index_t N, index_t StrideC) + { + const auto c_grid_desc_m_n = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(StrideC, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(M, N), make_tuple(I1, StrideC)); + } + }(); + + if constexpr(GemmSpec == GemmSpecialization::MNPadding) + { + const auto PadM = (MPerBlock - M % MPerBlock) % MPerBlock; + const auto PadN = (NPerBlock - N % NPerBlock) % NPerBlock; + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_right_pad_transform(M, PadM), make_right_pad_transform(N, PadN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + + return transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_pass_through_transform(M), make_pass_through_transform(N)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + } + + static auto GetKPad(index_t K, index_t KBatch) + { + const index_t K0 = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock; + const index_t KPad = KBatch * K0 * K1; + return KPad; + } + + using AGridDesc_K0_M_K1 = decltype(MakeAGridDescriptor_KBatch_K0_M_K1(1, 1, 1, 1, 1)); + using BGridDesc_K0_N_K1 = decltype(MakeBGridDescriptor_KBatch_K0_N_K1(1, 1, 1, 1, 1)); + using CGridDesc_M_N = decltype(MakeCGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::Set, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CBlockTransferScalarPerVector_NWaveNPerXDL, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>; + + // GridwiseGemm + using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::AtomicAdd, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + 
ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + CShuffleMRepeatPerShuffle, + CShuffleNRepeatPerShuffle, + CBlockTransferScalarPerVector_NWaveNPerXDL, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>; + + using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = + decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{})); + + using Block2CTileMap = typename GridwiseGemm::CBlockClusterAdaptor; + + // Argument + struct Argument : public BaseArgument + { + Argument(const ADataType* p_a_grid, + const BDataType* p_b_grid, + CDataType* p_c_grid, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + index_t M01, + index_t N01, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + index_t k_batch) + : p_a_grid_{p_a_grid}, + p_b_grid_{p_b_grid}, + p_c_grid_{p_c_grid}, + a_grid_desc_kbatch_k0_m_k1_{}, + b_grid_desc_kbatch_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op}, + k_batch_{k_batch} + { + int KPad = DeviceGemmXdlSplitKCShuffle::GetKPad(K, k_batch_); + + a_grid_desc_kbatch_k0_m_k1_ = + DeviceGemmXdlSplitKCShuffle::MakeAGridDescriptor_KBatch_K0_M_K1( + M, K, StrideA, k_batch_, KPad); + b_grid_desc_kbatch_k0_n_k1_ = + DeviceGemmXdlSplitKCShuffle::MakeBGridDescriptor_KBatch_K0_N_K1( + K, N, StrideB, k_batch_, KPad); + c_grid_desc_m_n_ = DeviceGemmXdlSplitKCShuffle::MakeCGridDescriptor_M_N(M, N, StrideC); + + block_2_ctile_map_ = + GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_); + + if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_, + b_grid_desc_kbatch_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_); + } + } + + // private: + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_; + Block2CTileMap block_2_ctile_map_; + index_t M01_; + index_t N01_; + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + index_t k_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceGemmXdlSplitKCShuffle::Argument; + + void ShowInfo(const Argument& arg) + { + std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{" + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{" + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", " + << 
arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{ " << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + ShowInfo(arg); + + const auto kbatch = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0); + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_); + + const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1); + + const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0); + + float ave_time = 0; + + const auto Run = [&](const auto& kernel) { + hipGetErrorString(hipMemset( + arg.p_c_grid_, + 0, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() * + sizeof(CDataType))); + + ave_time = + launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.block_2_ctile_map_); + }; + + if(has_main_k0_block_loop) + { + if(kbatch == 1) + { + const auto kernel = kernel_gemm_xdlops_v2r4r2< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + true>; + + Run(kernel); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r4r2< + GridwiseGemmAtomicAdd, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + true>; + + Run(kernel); + } + } + else + { + if(kbatch == 1) + { + const auto kernel = kernel_gemm_xdlops_v2r4r2< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + false>; + + Run(kernel); + } + else + { + const auto kernel = kernel_gemm_xdlops_v2r4r2< + GridwiseGemmAtomicAdd, + ADataType, // TODO: distiguish A/B datatype + CDataType, + remove_reference_t, + remove_reference_t, + remove_reference_t, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + remove_reference_t, + false>; + + Run(kernel); + } + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + 
arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const ADataType* p_a, + const BDataType* p_b, + CDataType* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + index_t KBatch) + { + return Argument{p_a, + p_b, + p_c, + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op, + KBatch}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr MakeArgumentPointer(const void* p_a, + const void* p_b, + void* p_c, + index_t M, + index_t N, + index_t K, + index_t StrideA, + index_t StrideB, + index_t StrideC, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op, + ck::index_t KBatch = 1) override + { + return std::make_unique(static_cast(p_a), + static_cast(p_b), + static_cast(p_c), + M, + N, + K, + StrideA, + StrideB, + StrideC, + 1, + 1, + a_element_op, + b_element_op, + c_element_op, + KBatch); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGemmXdlSplitKCShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..03d9e26a --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_contraction_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,907 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
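GetKPad in the split-K device op above pads K so that it splits evenly into KBatch batches of K0 * K1 elements, with K0 itself a multiple of K0PerBlock. A minimal restatement with one worked example follows; the helper name and the sample sizes are illustrative only.

// Stand-alone restatement of DeviceGemmXdlSplitKCShuffle::GetKPad above:
//   K0   = ceil(K / (K1 * K0PerBlock * KBatch)) * K0PerBlock
//   KPad = KBatch * K0 * K1
constexpr int get_k_pad(int K, int K1, int K0PerBlock, int KBatch)
{
    const int k0 = (K + K1 * K0PerBlock * KBatch - 1) / (K1 * K0PerBlock * KBatch) * K0PerBlock;
    return KBatch * k0 * K1;
}

// Worked example with K = 100, K1 = 8, K0PerBlock = 4, KBatch = 3:
//   ceil(100 / 96) = 2, so K0 = 2 * 4 = 8 and KPad = 3 * 8 * 8 = 192;
// each of the 3 K-batches then iterates over K0 = 8 slices of K1 = 8 elements.
static_assert(get_k_pad(100, 8, 4, 3) == 192, "split-K padding example");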
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_contraction_multiple_d_xdl_cshuffle( + const void CK_CONSTANT_ADDRESS_SPACE* contraction_args, + const index_t group_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t block_id = get_block_1d_id(); + + const auto contraction_arg_ptr = reinterpret_cast( + cast_pointer_to_generic_address_space(contraction_args)); + + index_t left = 0; + index_t right = group_count; + index_t group_id = index_t((left + right) / 2); + + while((!(block_id >= contraction_arg_ptr[group_id].block_start_ && + block_id < contraction_arg_ptr[group_id].block_end_)) && + left <= right) + { + if(block_id < contraction_arg_ptr[group_id].block_start_) + { + right = group_id; + } + else + { + left = group_id; + } + group_id = index_t((left + right) / 2); + } + + GridwiseGemm::template Run( + contraction_arg_ptr[group_id].p_a_grid_, + contraction_arg_ptr[group_id].p_b_grid_, + contraction_arg_ptr[group_id].p_ds_grid_, + contraction_arg_ptr[group_id].p_e_grid_, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + contraction_arg_ptr[group_id].a_grid_desc_ak0_m_ak1_, + contraction_arg_ptr[group_id].b_grid_desc_bk0_n_bk1_, + contraction_arg_ptr[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_, + contraction_arg_ptr[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_, + contraction_arg_ptr[group_id].block_2_etile_map_); +#else + ignore = contraction_args; + ignore = group_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; +#endif +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Tensor Contraction: +// input : A +// input : B +// input : D0, D1, ... +// output : E +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// A[M0, M1, M2, ..., K0, K1, K2, ...] +// B[N0, N1, N2, ..., K0, K1, K2, ...] +// D[M0, M1, M2, ..., N0, N1, N2, ...] +// E[M0, M1, M2, ..., N0, N1, N2, ...] 
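As a concrete reading of the contraction convention spelled out in the comment above (A indexed by [M..., K...], B by [N..., K...], E and the D tensors by [M..., N...]), the sketch below evaluates the NumDimM = NumDimN = NumDimK = 1 case with a single D tensor, identity a_op/b_op, and cde_op chosen as addition. It is a hedged reference under those assumptions, not the device implementation.

#include <cstddef>
#include <vector>

// E[m, n] = cde_op( sum_k a_op(A[m, k]) * b_op(B[n, k]), D0[m, n] ),
// shown for single M/N/K dimensions with identity a_op/b_op and cde_op = add.
// Note that B is indexed as [N..., K...], matching the convention above.
inline std::vector<float> reference_contraction_bilinear(const std::vector<float>& a,  // M x K
                                                         const std::vector<float>& b,  // N x K
                                                         const std::vector<float>& d0, // M x N
                                                         int M, int N, int K)
{
    std::vector<float> e(static_cast<std::size_t>(M) * N);
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float c_mn = 0.f;
            for(int k = 0; k < K; ++k)
                c_mn += a[m * K + k] * b[n * K + k];
            e[m * N + n] = c_mn + d0[m * N + n]; // cde_op(C, D0) taken to be C + D0
        }
    return e;
}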
+template +struct DeviceGroupedContractionMultipleD_Xdl_CShuffle + : public DeviceGroupedContractionMultipleD +{ + using DeviceOp = DeviceGroupedContractionMultipleD_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + // Assume: A[M0, M1, M2, ..., K0, K1, K2, ...] + static auto MakeAGridDescriptor_M_K(const std::vector& a_ms_ks_lengths_vec, + const std::vector& a_ms_ks_strides_vec) + { + assert(a_ms_ks_lengths_vec.size() == NumDimM + NumDimK && + a_ms_ks_strides_vec.size() == NumDimM + NumDimK); + + const auto to_tuple = [&](auto& vec, auto num) { + return generate_tuple([&](auto i) { return vec[i]; }, num); + }; + + const auto a_ms_ks_lengths = to_tuple(a_ms_ks_lengths_vec, Number{}); + const auto a_ms_ks_strides = to_tuple(a_ms_ks_strides_vec, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(a_ms_ks_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(a_ms_ks_lengths, kDimIds); + + if constexpr(ASpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto a_grid_desc_mraw_kraw = make_naive_tensor_descriptor( + make_tuple(M, K), + make_tuple(a_ms_ks_strides[Number{}], + a_ms_ks_strides[Number{}])); + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + else + { + // naive tensor A[M0, M1, M2, ..., K0, K1, K2...] + const auto a_grid_desc_ms_ks = + make_naive_tensor_descriptor(a_ms_ks_lengths, a_ms_ks_strides); + + // transformed tensor A[MRaw = M0 * M1 * M2 * ... , KRaw = K0 * K1 * K2 * ...] + const auto a_grid_desc_mraw_kraw = transform_tensor_descriptor( + a_grid_desc_ms_ks, + make_tuple(make_merge_transform(mLengths), make_merge_transform(kLengths)), + make_tuple(mDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + } + + // Assume: B[N0, N1, N2, ..., K0, K1, K2, ...] + static auto MakeBGridDescriptor_N_K(const std::vector& b_ns_ks_lengths_vec, + const std::vector& b_ns_ks_strides_vec) + { + assert(b_ns_ks_lengths_vec.size() == NumDimN + NumDimK && + b_ns_ks_strides_vec.size() == NumDimN + NumDimK); + + const auto to_tuple = [&](auto& vec, auto num) { + return generate_tuple([&](auto i) { return vec[i]; }, num); + }; + + const auto b_ns_ks_lengths = to_tuple(b_ns_ks_lengths_vec, Number{}); + const auto b_ns_ks_strides = to_tuple(b_ns_ks_strides_vec, Number{}); + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = typename arithmetic_sequence_gen<0, NumDimN, 1>::type{}; + + // dimension Ids for K0, K1, ... + constexpr auto kDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for K0, K1, ... + const auto kLengths = get_container_subset(b_ns_ks_lengths, kDimIds); + + // lengths for N0, N1, ... 
+ const auto nLengths = get_container_subset(b_ns_ks_lengths, nDimIds); + + if constexpr(BSpec == TensorSpecialization::Packed) + { + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + auto K = container_reduce(kLengths, math::multiplies{}, Number<1>{}); + const auto b_grid_desc_nraw_kraw = make_naive_tensor_descriptor( + make_tuple(N, K), + make_tuple(b_ns_ks_strides[Number{}], + b_ns_ks_strides[Number{}])); + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + else + { + // naive tensor B[N0, N1, N2, ..., K0, K1, K2, ...] + const auto b_grid_desc_ns_ks = + make_naive_tensor_descriptor(b_ns_ks_lengths, b_ns_ks_strides); + + // transformed tensor B[NRaw = N0 * N1 * N2 * ..., KRaw = K0 * K1 * K2 * ...] + const auto b_grid_desc_nraw_kraw = transform_tensor_descriptor( + b_grid_desc_ns_ks, + make_tuple(make_merge_transform(nLengths), make_merge_transform(kLengths)), + make_tuple(nDimIds, kDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + } + + // assume E[M0, M1, M2, ..., N0, N1, N2...] + static auto MakeEGridDescriptor_M_N(const std::vector& e_ms_ns_lengths_vec, + const std::vector& e_ms_ns_strides_vec) + { + assert(e_ms_ns_lengths_vec.size() == NumDimM + NumDimN && + e_ms_ns_strides_vec.size() == NumDimM + NumDimN); + + const auto to_tuple = [&](auto& vec, auto num) { + return generate_tuple([&](auto i) { return vec[i]; }, num); + }; + + const auto e_ms_ns_lengths = to_tuple(e_ms_ns_lengths_vec, Number{}); + const auto e_ms_ns_strides = to_tuple(e_ms_ns_strides_vec, Number{}); + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = typename arithmetic_sequence_gen<0, NumDimM, 1>::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(e_ms_ns_lengths, mDimIds); + + // lengths for K0, K1, ... + const auto nLengths = get_container_subset(e_ms_ns_lengths, nDimIds); + + if constexpr(DESpec == TensorSpecialization::Packed) + { + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto e_grid_desc_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(M, N), + make_tuple(e_ms_ns_strides[Number{}], + e_ms_ns_strides[Number{}])); + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + else + { + // naive tensor E[M0, M1, M2, ..., N0, N1, N2...] + const auto e_grid_desc_ms_ns = + make_naive_tensor_descriptor(e_ms_ns_lengths, e_ms_ns_strides); + + // transformed tensor E[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * N2 * ...] 
+ const auto e_grid_desc_mraw_nraw = transform_tensor_descriptor( + e_grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_ms_ns_lengths_vec, + const std::array, NumDTensor>& ds_ms_ns_strides_vec) + { + return generate_tuple( + [&](auto i) { + return DeviceOp::MakeEGridDescriptor_M_N(ds_ms_ns_lengths_vec[i], + ds_ms_ns_strides_vec[i]); + }, + Number{}); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K({}, {})); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K({}, {})); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N({}, {})); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + struct GroupedContractionBlock2ETileMap + { + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + GroupedContractionBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n, + ck::index_t BlockStart) + { + default_block_2_etile_map_ = GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); + block_start_ = BlockStart; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + return default_block_2_etile_map_.CalculateBottomIndex( + make_multi_index(idx_top[I0] - block_start_)); + } + + // it's actually E-Tile + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) const + { + return default_block_2_etile_map_.ValidCTileIndex(c_tile_idx, c_tile_dim); + } + + __host__ bool CheckValidity(const EGridDesc_M_N& e_grid_desc_m_n) const + { + return default_block_2_etile_map_.CheckValidity(e_grid_desc_m_n); + } + + Block2ETileMap default_block_2_etile_map_; + ck::index_t block_start_; + }; + + struct ContractionMultiDKernelArg + { + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for block/thread-wise copy 
+ AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // lock-to-e-tile map + GroupedContractionBlock2ETileMap block_2_etile_map_; + + ck::index_t block_start_, block_end_; + }; + + struct ContractionMultiDDeviceArg + { + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // Strides for the last M/N/K dimensions of A/B/Ds/E + // for sanity check of vector load/store + index_t a_mz_stride_; + index_t a_kz_stride_; + index_t b_nz_stride_; + index_t b_kz_stride_; + std::array ds_nz_stride_; + // index_t e_mz_stride_; + index_t e_nz_stride_; + }; + + // Argument + struct Argument : public BaseArgument + { + Argument(std::vector p_a_vec, + std::vector p_b_vec, + std::vector> p_ds_vec, + std::vector p_e_vec, + std::vector> contraction_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + : a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op} + { + group_count_ = contraction_descs.size(); + + if(!(group_count_ == p_a_vec.size() && group_count_ == p_b_vec.size() && + group_count_ == p_e_vec.size())) + { + throw std::runtime_error("wrong! group_count_ != a/b/e_vec.size"); + } + + contraction_multi_d_kernel_args_.reserve(group_count_); + + grid_size_ = 0; + + for(std::size_t i = 0; i < group_count_; i++) + { + const auto p_a_grid = static_cast(p_a_vec[i]); + const auto p_b_grid = static_cast(p_b_vec[i]); + const auto p_e_grid = static_cast(p_e_vec[i]); + + const auto a_grid_desc_m_k = DeviceOp::MakeAGridDescriptor_M_K( + contraction_descs[i].a_ms_ks_lengths, contraction_descs[i].a_ms_ks_strides); + const auto b_grid_desc_n_k = DeviceOp::MakeBGridDescriptor_N_K( + contraction_descs[i].b_ns_ks_lengths, contraction_descs[i].b_ns_ks_strides); + + DsGridDesc_M_N ds_grid_desc_m_n; + typename GridwiseGemm::DsGridPointer p_ds_grid; + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto j) { + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid(j) = static_cast(p_ds_vec[i][j]); + + // D desc + ds_grid_desc_m_n(j) = + DeviceOp::MakeEGridDescriptor_M_N(contraction_descs[i].ds_ms_ns_lengths[j], + contraction_descs[i].ds_ms_ns_strides[j]); + }); + + const auto e_grid_desc_m_n = DeviceOp::MakeEGridDescriptor_M_N( + contraction_descs[i].e_ms_ns_lengths, contraction_descs[i].e_ms_ns_strides); + + const auto a_grid_desc_ak0_m_ak1 = + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k); + const auto b_grid_desc_bk0_n_bk1 = + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k); + + const auto ds_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n); + const auto e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n); + + const index_t grid_size_grp = + GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n) + .CalculateGridSize(e_grid_desc_m_n); + + const index_t BlockStart = grid_size_; + const index_t BlockEnd = grid_size_ + grid_size_grp; + + grid_size_ += grid_size_grp; + + const auto 
block_2_etile_map = + GroupedContractionBlock2ETileMap(e_grid_desc_m_n, BlockStart); + + // for sanity check of vector memory access + const index_t a_mz_stride = contraction_descs[i].a_ms_ks_strides[NumDimM - 1]; + const index_t a_kz_stride = + contraction_descs[i].a_ms_ks_strides[NumDimM + NumDimK - 1]; + + const index_t b_nz_stride = contraction_descs[i].b_ns_ks_strides[NumDimN - 1]; + const index_t b_kz_stride = + contraction_descs[i].b_ns_ks_strides[NumDimN + NumDimK - 1]; + + std::array ds_nz_stride; + for(index_t j = 0; j < NumDTensor; ++j) + { + ds_nz_stride[j] = + contraction_descs[i].ds_ms_ns_strides[j][NumDimM + NumDimN - 1]; + } + + const index_t e_nz_stride = + contraction_descs[i].e_ms_ns_strides[NumDimM + NumDimN - 1]; + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, + b_grid_desc_n_k, + ds_grid_desc_m_n, + e_grid_desc_m_n, + block_2_etile_map)) + { + contraction_multi_d_kernel_args_.push_back( + {p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map, + BlockStart, + BlockEnd}); + + contraction_multi_d_device_args_.push_back({a_grid_desc_m_k, + b_grid_desc_n_k, + ds_grid_desc_m_n, + e_grid_desc_m_n, + a_mz_stride, + a_kz_stride, + b_nz_stride, + b_kz_stride, + ds_nz_stride, + e_nz_stride}); + } + } + } + + std::vector contraction_multi_d_kernel_args_; + std::vector contraction_multi_d_device_args_; + + std::size_t group_count_; + index_t grid_size_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + bool has_main_k_block_loop = true; + + for(std::size_t i = 0; i < arg.group_count_; i++) + { + const auto K = + arg.contraction_multi_d_kernel_args_[i].a_grid_desc_ak0_m_ak1_.GetLength(I0) * + arg.contraction_multi_d_kernel_args_[i].a_grid_desc_ak0_m_ak1_.GetLength(I2); + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K) != has_main_k_block_loop) + { + throw std::runtime_error("wrong! 
not all gemm has_main_k_block_loop"); + } + } + + hipGetErrorString(hipMemcpy(arg.p_workspace_, + arg.contraction_multi_d_kernel_args_.data(), + arg.contraction_multi_d_kernel_args_.size() * + sizeof(ContractionMultiDKernelArg), + hipMemcpyHostToDevice)); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = + kernel_grouped_contraction_multiple_d_xdl_cshuffle; + + return launch_and_time_kernel( + stream_config, + kernel, + dim3(arg.grid_size_), + dim3(BlockSize), + 0, + cast_pointer_to_constant_address_space(arg.p_workspace_), + arg.group_count_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_); + }; + + if(has_main_k_block_loop) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a")) + { + return false; + } + + for(std::size_t i = 0; i < arg.group_count_; i++) + { + const auto a_grid_desc_m_k_ = arg.contraction_multi_d_device_args_[i].a_grid_desc_m_k_; + const auto b_grid_desc_n_k_ = arg.contraction_multi_d_device_args_[i].b_grid_desc_n_k_; + const auto ds_grid_desc_m_n_ = + arg.contraction_multi_d_device_args_[i].ds_grid_desc_m_n_; + const auto e_grid_desc_m_n_ = arg.contraction_multi_d_device_args_[i].e_grid_desc_m_n_; + const auto a_grid_desc_ak0_m_ak1_ = + arg.contraction_multi_d_kernel_args_[i].a_grid_desc_ak0_m_ak1_; + const auto b_grid_desc_bk0_n_bk1_ = + arg.contraction_multi_d_kernel_args_[i].b_grid_desc_bk0_n_bk1_; + const auto ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + arg.contraction_multi_d_kernel_args_[i] + .ds_grid_desc_mblock_mperblock_nblock_nperblock_; + const auto e_grid_desc_mblock_mperblock_nblock_nperblock_ = + arg.contraction_multi_d_kernel_args_[i] + .e_grid_desc_mblock_mperblock_nblock_nperblock_; + + const auto block_2_etile_map_ = + arg.contraction_multi_d_kernel_args_[i].block_2_etile_map_; + + const auto a_mz_stride_ = arg.contraction_multi_d_device_args_[i].a_mz_stride_; + const auto a_kz_stride_ = arg.contraction_multi_d_device_args_[i].a_kz_stride_; + const auto b_nz_stride_ = arg.contraction_multi_d_device_args_[i].b_nz_stride_; + const auto b_kz_stride_ = arg.contraction_multi_d_device_args_[i].b_kz_stride_; + const auto ds_nz_stride_ = arg.contraction_multi_d_device_args_[i].ds_nz_stride_; + const auto e_nz_stride_ = arg.contraction_multi_d_device_args_[i].e_nz_stride_; + + if(!GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + return false; + } + + // check vector access + static_assert((ABlockTransferSrcVectorDim == 1 || ABlockTransferSrcVectorDim == 2) && + (BBlockTransferSrcVectorDim == 1 || BBlockTransferSrcVectorDim == 2), + "wrong!"); + + // vector memory access of A: could be on M or AK1 dimension + if constexpr(ABlockTransferSrcVectorDim == 1) + { + if(!(a_mz_stride_ == 1 && + a_grid_desc_ak0_m_ak1_.GetLength(I1) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(a_kz_stride_ == 1 && + a_grid_desc_ak0_m_ak1_.GetLength(I2) % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of B: could be on N or 
BK1 dimension + if constexpr(BBlockTransferSrcVectorDim == 1) + { + if(!(b_nz_stride_ == 1 && + b_grid_desc_bk0_n_bk1_.GetLength(I1) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + if(!(b_kz_stride_ == 1 && + b_grid_desc_bk0_n_bk1_.GetLength(I2) % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + + // vector memory access of Ds: always on NPerBlock dimension + bool valid_d_access = true; + + static_for<0, NumDTensor, 1>{}([&](auto j) { + if(!(ds_nz_stride_[j] == 1 && + ds_grid_desc_mblock_mperblock_nblock_nperblock_[j].GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0)) + { + valid_d_access = false; + } + }); + + if(valid_d_access == false) + { + return false; + } + + // vector memory access of E: always on NPerBlock dimension + if(!(e_nz_stride_ == 1 && e_grid_desc_mblock_mperblock_nblock_nperblock_.GetLength(I3) % + CDEBlockTransferScalarPerVector_NPerBlock == + 0)) + { + return false; + } + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(std::vector p_a_vec, + std::vector p_b_vec, + std::vector> p_ds_vec, + std::vector p_e_vec, + std::vector> contraction_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) + { + return Argument{p_a_vec, + p_b_vec, + p_ds_vec, + p_e_vec, + contraction_descs, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(std::vector p_a_vec, + std::vector p_b_vec, + std::vector> p_ds_vec, + std::vector p_e_vec, + std::vector> contraction_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation cde_element_op) override + { + return std::make_unique(p_a_vec, + p_b_vec, + p_ds_vec, + p_e_vec, + contraction_descs, + a_element_op, + b_element_op, + cde_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedContractionMultipleD_Xdl_CShuffle" + << "<" + << NumDimM << ", " + << NumDimN << ", " + << NumDimK << ", " + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << ABlockTransferSrcVectorDim << ", " + << BBlockTransferSrcVectorDim + << ">"; + // clang-format on + + return str.str(); + } + + size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override + { + return dynamic_cast(p_arg)->group_count_ * + sizeof(ContractionMultiDKernelArg); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..682aba08 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp @@ -0,0 +1,1015 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
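
[Editorial annotation, not part of the patch.] The header added below implements grouped 2-D backward-data convolution by decomposing it into several implicit GEMMs: for each (i_ytilde, i_xtilde) filter slice it builds its own A/B/Ds/E grid descriptors and block-to-E-tile map, and the G (group) dimension is handled by offsetting each workgroup's base pointers through ComputePtrOffsetOfStridedBatch. The sketch below is illustrative only; the helper names count_sub_gemms and group_base_offset are invented for this note, while ConvStrideH/W, ConvDilationH/W, g_idx and the batch stride mirror quantities used in the file.

    #include <cstdint>
    #include <numeric> // std::gcd

    // Upper bound on the number of sub-GEMMs created by the backward-data transform:
    // one per (i_ytilde, i_xtilde) filter slice; slices whose YDotSlice * XDotSlice
    // turns out empty are skipped when the Argument is constructed.
    inline int count_sub_gemms(int ConvStrideH, int ConvStrideW,
                               int ConvDilationH, int ConvDilationW)
    {
        const int YTilde = ConvStrideH / std::gcd(ConvStrideH, ConvDilationH);
        const int XTilde = ConvStrideW / std::gcd(ConvStrideW, ConvDilationW);
        return YTilde * XTilde;
    }

    // Per-group base-pointer offset used by the batched kernel: workgroup g_idx of a
    // tensor with group (batch) stride batch_stride starts at element g_idx * batch_stride.
    inline std::int64_t group_base_offset(int g_idx, int batch_stride)
    {
        return static_cast<std::int64_t>(g_idx) * static_cast<std::int64_t>(batch_stride);
    }
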
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +template +struct ComputePtrOffsetOfStridedBatch +{ + ComputePtrOffsetOfStridedBatch() = default; + + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + Array BatchStrideDs, + index_t BatchStrideE) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideE_(BatchStrideE) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + Array ds_offset; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { ds_offset(i) = g_idx * static_cast(BatchStrideDs_[i]); }); + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE_); + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + Array BatchStrideDs_; + index_t BatchStrideE_; +}; + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link + * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of + * pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). 
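 *
 * \note Editorial illustration, not part of the original comment: assuming evenly strided
 * batches, the kernel below maps each workgroup to its batch and offsets the base pointers
 * roughly as in this simplified excerpt before handing the tile to GridwiseGemm::Run:
 * \code
 *   const index_t num_blocks_per_batch = get_grid_size() / batch_count;
 *   const index_t g_idx                = get_block_1d_id() / num_blocks_per_batch;
 *   const long_index_t a_offset        = compute_ptr_offset_of_batch.GetAPtrOffset(g_idx);
 *   // ...same for B, the Ds and E; GridwiseGemm::Run then operates on the offset pointers
 * \endcode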
+ * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle( + const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const Block2ETileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + // offset base pointer for each work-group + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; +#endif +} + +} // namespace + +// Conv backward data multiple D: +// input : output image A: [G, N, K, Ho, Wo] +// input : weight B: [G, K, C, Y, X], +// input : D0, D1, ... : [G, N, K, Ho, Wo] +// output : input image E: [G, N, C, Hi, Wi] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +template +struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 + : public DeviceGroupedConvBwdDataMultipleD +{ + // FIXME + static_assert(NDimSpatial == 2, "wrong! 
only implemented for 2D now"); + + using DeviceOp = DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + // TODO make A/B datatype different + using ABDataType = ADataType; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto transform_conv_to_gemm = + TransformConvBwdDataToGemm_v1{}; + + static auto GetDummyABDsEGridDescriptor() + { + const std::array dummy_tensor_lengths = {1}; + const std::array dummy_tensor_strides = {1}; + const std::array dummy_spatial_lengths = {1}; + + const auto a_grid_desc_ak0_m_ak1 = + transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths); + + const auto b_grid_desc_bk0_n_bk1 = + transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths); + + const auto ds_grid_desc_m_n = generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return transform_conv_to_gemm.template MakeCDescriptor_M_N( + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths); + }, + Number{}); + + const auto e_grid_desc_m_n = + transform_conv_to_gemm.template MakeCDescriptor_M_N(dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_tensor_lengths, + dummy_tensor_strides, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths, + dummy_spatial_lengths); + + return make_tuple( + a_grid_desc_ak0_m_ak1, b_grid_desc_bk0_n_bk1, ds_grid_desc_m_n, e_grid_desc_m_n); + } + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ABDataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOp, + BElementwiseOp, + CDEElementwiseOp, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + template + static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1& 
desc_k0_m_k1) + { + const auto grid_desc_m_k = transform_tensor_descriptor( + desc_k0_m_k1, + make_tuple(make_pass_through_transform(desc_k0_m_k1.GetLength(I1)), + make_merge_transform( + make_tuple(desc_k0_m_k1.GetLength(I0), desc_k0_m_k1.GetLength(I2)))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return grid_desc_m_k; + } + + // desc + using ABDsEGridDesc = decltype(GetDummyABDsEGridDescriptor()); + + using AGridDesc_AK0_M_AK1 = remove_cvref_t>; + using BGridDesc_BK0_N_BK1 = remove_cvref_t>; + using DsGridDesc_M_N = remove_cvref_t>; + using EGridDesc_M_N = remove_cvref_t>; + + using AGridDesc_M_K = decltype(transform_k0_m_k1_to_m_k(AGridDesc_AK0_M_AK1{})); + using BGridDesc_N_K = decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{})); + + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype( + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{})); + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = decltype( + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{})); + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a, // output image + const void* p_b, // weight + const std::array& p_ds, // bias + void* p_e, // input image + const std::array& a_g_n_k_wos_lengths, + const std::array& a_g_n_k_wos_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& + ds_g_n_c_wis_lengths, + const std::array, NumDTensor>& + ds_g_n_c_wis_strides, + const std::array& e_g_n_c_wis_lengths, + const std::array& e_g_n_c_wis_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOp& a_element_op, + const BElementwiseOp& b_element_op, + const CDEElementwiseOp& cde_element_op) + : p_a_grid_{static_cast(p_a)}, + p_b_grid_{static_cast(p_b)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e)}, + num_group_{a_g_n_k_wos_lengths[0]}, + num_gemm_{}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_g_n_k_wos_lengths_{a_g_n_k_wos_lengths}, + a_g_n_k_wos_strides_{a_g_n_k_wos_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + ds_g_n_c_wis_lengths_{ds_g_n_c_wis_lengths}, + ds_g_n_c_wis_strides_{ds_g_n_c_wis_strides}, + e_g_n_c_wis_lengths_{e_g_n_c_wis_lengths}, + e_g_n_c_wis_strides_{e_g_n_c_wis_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // populate Ds pointer + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + p_ds_grid_(i) = static_cast(p_ds[i]); + }); + + // A/B/Ds/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_c_wis_strides[0]; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_c_wis_strides[i][0]; + }); + + // problem definition + const index_t Y = b_g_k_c_xs_lengths[3]; + const index_t X = b_g_k_c_xs_lengths[4]; + + const index_t ConvStrideH = conv_filter_strides_[0]; + const index_t ConvStrideW = 
conv_filter_strides_[1]; + + const index_t ConvDilationH = conv_filter_dilations_[0]; + const index_t ConvDilationW = conv_filter_dilations_[1]; + + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + // number of GEMM + num_gemm_ = YTilde * XTilde; + + for(index_t i_ytilde = 0; i_ytilde < YTilde; ++i_ytilde) + { + for(index_t i_xtilde = 0; i_xtilde < XTilde; ++i_xtilde) + { + // check slice is valid + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + if(YDotSlice * XDotSlice <= 0) + { + continue; + } + + const auto a_grid_desc_ak0_m_ak1 = + transform_conv_to_gemm.template MakeADescriptor_AK0_M_AK1( + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + {i_ytilde, i_xtilde}); + + const auto b_grid_desc_bk0_n_bk1 = + transform_conv_to_gemm.template MakeBDescriptor_BK0_N_BK1( + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + {i_ytilde, i_xtilde}); + + DsGridDesc_M_N ds_grid_desc_m_n; + + // populate Ds desc + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + + ds_grid_desc_m_n(i) = + transform_conv_to_gemm.template MakeCDescriptor_M_N( + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_c_wis_lengths[i], + ds_g_n_c_wis_strides[i], + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + {i_ytilde, i_xtilde}); + }); + + const auto e_grid_desc_m_n = + transform_conv_to_gemm.template MakeCDescriptor_M_N( + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + {i_ytilde, i_xtilde}); + + // desc for problem definition + const auto a_grid_desc_m_k = transform_k0_m_k1_to_m_k(a_grid_desc_ak0_m_ak1); + const auto b_grid_desc_n_k = transform_k0_m_k1_to_m_k(b_grid_desc_bk0_n_bk1); + + a_grid_desc_m_k_container_.push_back(a_grid_desc_m_k); + b_grid_desc_n_k_container_.push_back(b_grid_desc_n_k); + ds_grid_desc_m_n_container_.push_back(ds_grid_desc_m_n); + e_grid_desc_m_n_container_.push_back(e_grid_desc_m_n); + + // desc for blockwise copy + a_grid_desc_ak0_m_ak1_container_.push_back(a_grid_desc_ak0_m_ak1); + b_grid_desc_bk0_n_bk1_container_.push_back(b_grid_desc_bk0_n_bk1); + + // block-to-e-tile-map + auto block_2_etile_map = + GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); + + block_2_etile_map_container_.push_back(block_2_etile_map); + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, + b_grid_desc_n_k, + ds_grid_desc_m_n, + e_grid_desc_m_n, + block_2_etile_map)) + { + ds_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n)); + + e_grid_desc_mblock_mperblock_nblock_nperblock_container_.push_back( + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + 
e_grid_desc_m_n)); + } + } + } + } + + void Print() const + { + for(index_t i = 0; i < num_gemm_; i++) + { + std::cout << "a_grid_desc_ak0_m_ak1_container_" + << a_grid_desc_ak0_m_ak1_container_[i] << std::endl; + + std::cout << "b_grid_desc_bk0_n_bk1_container_" + << b_grid_desc_bk0_n_bk1_container_[i] << std::endl; + + static_for<0, NumDTensor, 1>{}([&](auto j) { + std::cout << "ds_grid_desc_mblock_mperblock_nblock_nperblock_container_" + << ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i][j] + << std::endl; + }); + + std::cout << "e_grid_desc_mblock_mperblock_nblock_nperblock_container_" + << e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i] + << std::endl; + } + } + + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptor for problem definition + index_t num_group_; + index_t num_gemm_; + std::vector a_grid_desc_m_k_container_; + std::vector b_grid_desc_n_k_container_; + std::vector ds_grid_desc_m_n_container_; + std::vector e_grid_desc_m_n_container_; + + // tensor descriptor for block-wise copy + std::vector a_grid_desc_ak0_m_ak1_container_; + std::vector b_grid_desc_bk0_n_bk1_container_; + std::vector + ds_grid_desc_mblock_mperblock_nblock_nperblock_container_; + std::vector + e_grid_desc_mblock_mperblock_nblock_nperblock_container_; + + // block-to-e-tile map + std::vector block_2_etile_map_container_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOp a_element_op_; + BElementwiseOp b_element_op_; + CDEElementwiseOp cde_element_op_; + + // for checking IsSupportedArgument() + std::array a_g_n_k_wos_lengths_; + std::array a_g_n_k_wos_strides_; + std::array b_g_k_c_xs_lengths_; + std::array b_g_k_c_xs_strides_; + std::array, NumDTensor> ds_g_n_c_wis_lengths_; + std::array, NumDTensor> ds_g_n_c_wis_strides_; + std::array e_g_n_c_wis_lengths_; + std::array e_g_n_c_wis_strides_; + std::array conv_filter_strides_; + std::array conv_filter_dilations_; + std::array input_left_pads_; + std::array input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + float ave_time = 0; + + for(index_t i = 0; i < arg.num_gemm_; i++) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], + arg.b_grid_desc_n_k_container_[i], + arg.ds_grid_desc_m_n_container_[i], + arg.e_grid_desc_m_n_container_[i], + arg.block_2_etile_map_container_[i])) + { + throw std::runtime_error("wrong! 
device_op has invalid setting"); + } + + const index_t grid_size = arg.block_2_etile_map_container_[i].CalculateGridSize( + arg.e_grid_desc_m_n_container_[i]) * + arg.num_group_; + + const auto GemmK = arg.a_grid_desc_m_k_container_[i].GetLength(I1); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_grouped_conv_bwd_data_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOp, + BElementwiseOp, + CDEElementwiseOp, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + Block2ETileMap, + ComputePtrOffsetOfStridedBatch, + has_main_loop>; + + return launch_and_time_kernel( + stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_g_n_k_wos_lengths_[0], // Group count + arg.a_grid_desc_ak0_m_ak1_container_[i], + arg.b_grid_desc_bk0_n_bk1_container_[i], + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_container_[i], + arg.block_2_etile_map_container_[i], + arg.compute_ptr_offset_of_batch_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(GemmK)) + { + ave_time += launch_kernel(integral_constant{}); + } + else + { + ave_time += launch_kernel(integral_constant{}); + } + } + + return ave_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + const index_t ConvK = arg.b_g_k_c_xs_lengths_[1]; + const index_t ConvC = arg.b_g_k_c_xs_lengths_[2]; + + // Specifialization + if constexpr(ConvBackwardDataSpecialization == + ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 pad = 0 conv + for(int i = 0; i < NDimSpatial; i++) + { + if(!(arg.b_g_k_c_xs_lengths_[3 + i] == 1 && arg.conv_filter_strides_[i] == 1 && + arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0)) + { + return false; + } + } + } + + // vector load for A matrix from global memory to LDS + if constexpr(is_same_v) + { + if(!(ABlockTransferSrcVectorDim == 2 && ConvK % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // vector load for B matrix from global memory to LDS + if constexpr(is_same_v) + { + if(!(BBlockTransferSrcVectorDim == 1 && ConvC % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // vector store for Ds + bool ds_valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + + if constexpr(is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v) + { + // vector load D matrix from global memory + if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0)) + { + ds_valid = false; + } + } + else + { + ds_valid = false; + } + }); + + if(!ds_valid) + { + return false; + } + + // vector store for E + if constexpr(is_same_v) + { + // vector store C matrix into global memory + if(!(ConvC % CDEBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + } + else + { + return false; 
+ } + + // Gridwise GEMM size + for(std::size_t i = 0; i < arg.a_grid_desc_ak0_m_ak1_container_.size(); i++) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_container_[i], + arg.b_grid_desc_n_k_container_[i], + arg.ds_grid_desc_m_n_container_[i], + arg.e_grid_desc_m_n_container_[i], + arg.block_2_etile_map_container_[i])) + { + return false; + } + } + + return true; + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto + MakeArgument(const void* p_a, // output image + const void* p_b, // weight + const std::array& p_ds, // bias + void* p_e, // input image + const std::array& a_g_n_k_wos_lengths, // output image + const std::array& a_g_n_k_wos_strides, // output image + const std::array& b_g_k_c_xs_lengths, // weight + const std::array& b_g_k_c_xs_strides, // weight + const std::array, NumDTensor>& + ds_g_n_c_wis_lengths, // bias + const std::array, NumDTensor>& + ds_g_n_c_wis_strides, // bias + const std::array& e_g_n_c_wis_lengths, // input image + const std::array& e_g_n_c_wis_strides, // input image + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOp& a_element_op, + const BElementwiseOp& b_element_op, + const CDEElementwiseOp& cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_c_wis_lengths, + ds_g_n_c_wis_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeArgumentPointer( + const void* p_a, // output image + const void* p_b, // weight + const std::array& p_ds, // bias + void* p_e, // input image + const std::array& a_g_n_k_wos_lengths, // output image + const std::array& a_g_n_k_wos_strides, // output image + const std::array& b_g_k_c_xs_lengths, // weight + const std::array& b_g_k_c_xs_strides, // weight + const std::array, NumDTensor>& + ds_g_n_c_wis_lengths, // bias + const std::array, NumDTensor>& + ds_g_n_c_wis_strides, // bias + const std::array& e_g_n_c_wis_lengths, // input image + const std::array& e_g_n_c_wis_strides, // input image + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOp& a_element_op, + const BElementwiseOp& b_element_op, + const CDEElementwiseOp& cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_g_n_k_wos_lengths, + a_g_n_k_wos_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_c_wis_lengths, + ds_g_n_c_wis_strides, + e_g_n_c_wis_lengths, + e_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << 
getConvBackwardDataSpecializationString(ConvBackwardDataSpecialization) + << ">"; + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp new file mode 100644 index 00000000..d9e7b54c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp @@ -0,0 +1,1244 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +struct ComputePtrOffsetOfStridedBatch +{ + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideC_); + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + index_t BatchStrideC_; +}; + +} // namespace + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batched_gemm_xdlops_bwd_weight( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const index_t batch_count, + const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx))); + + __shared__ FloatAB p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB)]; + + GridwiseGemm::template Run(p_a_grid + 
a_batch_offset, + p_b_grid + b_batch_offset, + p_c_grid + c_batch_offset, + p_shared, + a_b_k0_m_k1_grid_desc, + b_b_k0_n_k1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_b_k0_m_k1_grid_desc; + ignore = b_b_k0_n_k1_grid_desc; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = batch_count; + ignore = block_2_ctile_map; + ignore = compute_ptr_offset_of_batch; + + compute_ptr_offset_of_batch.GetAPtrOffset(0); + compute_ptr_offset_of_batch.GetBPtrOffset(0); + compute_ptr_offset_of_batch.GetCPtrOffset(0); +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +template +struct DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle + : public DeviceGroupedConvBwdWeight< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + InDataType, + WeiDataType, + OutDataType, + InElementwiseOperation, + WeiElementwiseOperation, + OutElementwiseOperation> +{ + using DeviceOp = DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle; + + using ADataType = OutDataType; + using BDataType = InDataType; + using CDataType = WeiDataType; + + using AElementwiseOperation = OutElementwiseOperation; + using BElementwiseOperation = InElementwiseOperation; + using CElementwiseOperation = WeiElementwiseOperation; + + // TODO make A/B datatype different + using ABDataType = InDataType; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto K1Number = Number{}; + static constexpr auto GemmK1Number = K1Number; + + // Bytes per 32 lds bank: 32 * 4 bytes + static constexpr auto BankLength = 128; + static constexpr auto ElePerBank = BankLength / sizeof(ADataType); + + // M1 & M0 + static constexpr auto ABlockLdsM1PerBlock = ElePerBank / K1; + static constexpr auto ABlockLdsM0PerBlock = MPerBlock / ABlockLdsM1PerBlock; + static constexpr auto ABlockLdsM1Padding = 4; + + // N1 & N0 + static constexpr auto BBlockLdsN1PerBlock = ElePerBank / K1; + static constexpr auto BBlockLdsN0PerBlock = NPerBlock / BBlockLdsN1PerBlock; + static constexpr auto BBlockLdsN1Padding = 4; + + template ::type = false> + static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + ck::index_t batch_k) + { + using namespace ck; + + const index_t Wi = input_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[0]; + const index_t ConvStrideW = conv_filter_strides[0]; + const index_t ConvDilationW = conv_filter_dilations[0]; + const index_t InLeftPadW = input_left_pads[0]; + const index_t InRightPadW = input_right_pads[0]; + + const index_t GemmKTotal = N * Wo; + const index_t GemmM = K; + const index_t GemmN = C * X; + + const index_t GemmKBatch = batch_k; + const index_t GemmK0 = + math::integer_divide_ceil(GemmKTotal, GemmK1Number 
* K0PerBlock * GemmKBatch) * + K0PerBlock; + const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number; + + if constexpr(ConvBackwardWeightSpecialization == + ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Wo, K)); + + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_gemmktotal_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Wi, C)); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + else + { + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Wo, K)); + const auto in_n_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); + + // A: output tensor + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_n_wip_c_grid_desc = transform_tensor_descriptor( + in_n_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_n_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + 
make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_gemmktotal_gemmn_grid_desc = + transform_tensor_descriptor(in_n_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(X, C)), + make_merge_transform(make_tuple(N, Wo))), + make_tuple(Sequence<1, 3>{}, Sequence<0, 2>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + } + + template ::type = false> + static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + ck::index_t batch_k) + { + using namespace ck; + + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = filter_spatial_lengths[0]; + const index_t X = filter_spatial_lengths[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t GemmKTotal = N * Ho * Wo; + const index_t GemmM = K; + const index_t GemmN = C * X * Y; + + const index_t GemmKBatch = batch_k; + const index_t GemmK0 = + math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) * + K0PerBlock; + const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number; + + if constexpr(ConvBackwardWeightSpecialization == + ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + 
make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_gemmktotal_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Hi * Wi, C)); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + else + { + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)); + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + // A: output tensor + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmktotal_gemmn_grid_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Y, X, C)), + make_merge_transform(make_tuple(N, Ho, Wo))), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + 
make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y * X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + } + + template ::type = false> + static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + ck::index_t batch_k) + { + using namespace ck; + + const index_t Di = input_spatial_lengths[0]; + const index_t Hi = input_spatial_lengths[1]; + const index_t Wi = input_spatial_lengths[2]; + + const index_t Do = output_spatial_lengths[0]; + const index_t Ho = output_spatial_lengths[1]; + const index_t Wo = output_spatial_lengths[2]; + + const index_t Z = filter_spatial_lengths[0]; + const index_t Y = filter_spatial_lengths[1]; + const index_t X = filter_spatial_lengths[2]; + + const index_t ConvStrideD = conv_filter_strides[0]; + const index_t ConvStrideH = conv_filter_strides[1]; + const index_t ConvStrideW = conv_filter_strides[2]; + + const index_t ConvDilationD = conv_filter_dilations[0]; + const index_t ConvDilationH = conv_filter_dilations[1]; + const index_t ConvDilationW = conv_filter_dilations[2]; + + const index_t InLeftPadD = input_left_pads[0]; + const index_t InLeftPadH = input_left_pads[1]; + const index_t InLeftPadW = input_left_pads[2]; + + const index_t InRightPadD = input_right_pads[0]; + const index_t InRightPadH = input_right_pads[1]; + const index_t InRightPadW = input_right_pads[2]; + + const index_t GemmKTotal = N * Do * Ho * Wo; + const index_t GemmM = K; + const index_t GemmN = C * Z * X * Y; + + const index_t GemmKBatch = batch_k; + const index_t GemmK0 = + math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) * + K0PerBlock; + const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number; + + if constexpr(ConvBackwardWeightSpecialization == + ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)); + + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + 
make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_gemmktotal_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Di * Hi * Wi, C)); + + const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Z * Y * X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + else + { + const auto out_gemmktotal_gemmm_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N * Do * Ho * Wo, K)); + const auto in_n_di_hi_wi_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); + + // A: output tensor + const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor( + out_gemmktotal_gemmm_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor( + out_gemmkpad_gemmm_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmM)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // B: input tensor + const auto in_n_dip_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Di, InLeftPadD, InRightPadD), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_z_do_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_dip_hip_wip_c_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_gemmktotal_gemmn_grid_desc = transform_tensor_descriptor( + in_n_z_do_y_ho_x_wo_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(Z, Y, X, C)), + make_merge_transform(make_tuple(N, Do, Ho, Wo))), + make_tuple(Sequence<1, 3, 5, 7>{}, Sequence<0, 2, 4, 6>{}), + make_tuple(Sequence<1>{}, Sequence<0>{})); + + const 
auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor( + in_gemmktotal_gemmn_grid_desc, + make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor( + in_gemmkpad_gemmn_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(GemmKBatch, GemmK0, GemmK1Number)), + make_pass_through_transform(GemmN)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + + // C: weight tensor + const auto wei_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Z * Y * X * C)); + + return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc, + in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc, + wei_gemmm_gemmn_grid_desc); + } + } // function end + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<1>( + 1, 1, 1, {1}, {1}, {1}, {1}, {1}, {1}, {1}, 1); + } + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<2>( + 1, 1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, 1); + } + + template ::type = false> + static auto GetABCGridDesc() + { + return MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<3>(1, + 1, + 1, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + 1); + } + + // type convert descs + template + static auto PadDescriptor_M0_1d(Desc_M0 desc_m0, index_t gridSize, index_t blockSize) + { + const auto m0 = desc_m0.GetLength(I0); + const index_t loop_step = gridSize * blockSize * 4; + const auto pad = math::integer_least_multiple(m0, loop_step) - m0; + const auto desc_m0_pad = + transform_tensor_descriptor(desc_m0, + make_tuple(make_right_pad_transform(m0, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return desc_m0_pad; + } + + template + static auto MakeDescriptor_M0(const std::array& shape, + const std::array& stride, + index_t gridSize, + index_t blockSize) + { + auto tupleOfShape = generate_tuple([&](auto I) { return shape[I]; }, Number{}); + auto tupleOfStride = generate_tuple([&](auto I) { return stride[I]; }, Number{}); + + // nd desc - [s0, s1, s2, ...] + const auto desc = make_naive_tensor_descriptor(tupleOfShape, tupleOfStride); + + // merge nd to 1d desc - [s0 * s1 * ...] 
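+        // The merge below collapses [s0, s1, ..., s_{Dim-1}] into a single M0 dimension, and
+        // PadDescriptor_M0_1d then right-pads M0 up to a multiple of
+        // gridSize * blockSize * 4, presumably so the grid-stride copy that uses this
+        // descriptor always advances in whole steps without a remainder pass.
+        // Worked example (illustrative numbers only): gridSize = 2, blockSize = 256 gives
+        // loop_step = 2048; a shape {8, 16, 32} (m0 = 4096) needs no padding, while
+        // {10, 16, 32} (m0 = 5120) is padded by 6144 - 5120 = 1024 elements.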
+ if constexpr(Dim > 1) + { + const auto desc_m0 = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(tupleOfShape)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{})), + make_tuple(Sequence<0>{})); + + return PadDescriptor_M0_1d(desc_m0, gridSize, blockSize); + } + else + return PadDescriptor_M0_1d(desc, gridSize, blockSize); + } + + using GridDesc_M0 = decltype(MakeDescriptor_M0<1>({1}, {1}, 1, 1)); + + using ABCGridDescs = decltype(GetABCGridDesc()); + + using AGridDesc_K0_M_K1 = remove_cvref_t; + using BGridDesc_K0_N_K1 = remove_cvref_t; + using CGridDesc_M_N = remove_cvref_t; + + using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight< + BlockSize, + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CDataType, + InMemoryDataOperationEnum::AtomicAdd, + AGridDesc_K0_M_K1, + BGridDesc_K0_N_K1, + CGridDesc_M_N, + AElementwiseOperation, + BElementwiseOperation, + CElementwiseOperation, + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXdl, + NPerXdl, + K1, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsAddExtraM, + ABlockLdsM1PerBlock, + ABlockLdsM0PerBlock, + ABlockLdsM1Padding, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsAddExtraN, + BBlockLdsN1PerBlock, + BBlockLdsN0PerBlock, + BBlockLdsN1Padding, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CBlockTransferScalarPerVector_NWaveNPerXdl, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + true, + true>; + + // Argument + using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = + decltype(GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CGridDesc_M_N{})); + + using Block2CTileMap = + decltype(GridwiseGemm::MakeCBlockClusterAdaptor(CGridDesc_M_N{}, 1, 1, 1)); + + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_grid, + WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t G, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + ck::index_t M01, + ck::index_t N01, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) + : p_a_grid_{p_out_grid}, + p_b_grid_{p_in_grid}, + p_c_grid_{p_wei_grid}, + a_grid_desc_kbatch_k0_m_k1_{}, + b_grid_desc_kbatch_k0_n_k1_{}, + c_grid_desc_m_n_{}, + c_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_ctile_map_{}, + compute_ptr_offset_of_batch_{}, + M01_{M01}, + N01_{N01}, + a_element_op_{out_element_op}, + b_element_op_{in_element_op}, + c_element_op_{wei_element_op}, + Conv_G_{G}, + Conv_N_{N}, + Conv_K_{K}, + Conv_C_{C}, + output_spatial_lengths_{output_spatial_lengths}, + filter_spatial_lengths_{filter_spatial_lengths}, + conv_filter_strides_{conv_filter_strides}, + input_left_pads_{input_left_pads}, + 
input_right_pads_{input_right_pads}, + k_batch_{split_k} + { + const auto descs = + DeviceOp::MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N( + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + k_batch_); + + a_grid_desc_kbatch_k0_m_k1_ = descs[I0]; + b_grid_desc_kbatch_k0_n_k1_ = descs[I1]; + c_grid_desc_m_n_ = descs[I2]; + + block_2_ctile_map_ = + GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_); + + // A/B/C Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = + N * K * + std::accumulate(begin(output_spatial_lengths), + end(output_spatial_lengths), + index_t{1}, + std::multiplies<>{}); + compute_ptr_offset_of_batch_.BatchStrideB_ = + N * C * + std::accumulate(begin(input_spatial_lengths), + end(input_spatial_lengths), + index_t{1}, + std::multiplies<>{}); + compute_ptr_offset_of_batch_.BatchStrideC_ = + K * C * + std::accumulate(begin(filter_spatial_lengths), + end(filter_spatial_lengths), + index_t{1}, + std::multiplies<>{}); + + if(GridwiseGemm::CheckValidity(a_grid_desc_kbatch_k0_m_k1_, + b_grid_desc_kbatch_k0_n_k1_, + c_grid_desc_m_n_, + block_2_ctile_map_)) + { + c_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_); + } + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_; + BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_; + CGridDesc_M_N c_grid_desc_m_n_; + CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_; + + Block2CTileMap block_2_ctile_map_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + index_t M01_; + index_t N01_; + + InElementwiseOperation a_element_op_; + OutElementwiseOperation b_element_op_; + WeiElementwiseOperation c_element_op_; + + // for checking IsSupportedArgument() + index_t Conv_G_; + index_t Conv_N_; + index_t Conv_K_; + index_t Conv_C_; + std::array output_spatial_lengths_; + std::array filter_spatial_lengths_; + std::array conv_filter_strides_; + std::array input_left_pads_; + std::array input_right_pads_; + index_t k_batch_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + void ShowInfo(const Argument& arg) + { + std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{" + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", " + << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl; + + std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{" + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", " + << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl; + + std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", " + << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl; + } + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting"); + } + + const index_t grid_size = + arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Conv_G_; + + const auto K0 = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1); + + const bool has_main_k0_block_loop = GridwiseGemm::CalculateHasMainK0BlockLoop(K0); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_batched_gemm_xdlops_bwd_weight< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + CDataType, + OutElementwiseOperation, + InElementwiseOperation, + WeiElementwiseOperation, + remove_reference_t, + remove_reference_t, + remove_reference_t, + remove_reference_t, + ComputePtrOffsetOfStridedBatch, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.Conv_G_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_ctile_map_, + arg.compute_ptr_offset_of_batch_); + }; + + if(has_main_k0_block_loop) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(ConvBackwardWeightSpecialization == + ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 pad = 0 conv + for(int i = 0; i < NDimSpatial; i++) + { + if(!(arg.filter_spatial_lengths_[i] == 1 && arg.conv_filter_strides_[i] == 1 && + arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0)) + { + return false; + } + } + } + + // vector load A/B matrix from global memory + if(!(ABlockTransferSrcVectorDim == 2 && BBlockTransferSrcVectorDim == 2 && + arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 && + arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + + // vector store C matrix into global memory + if(!(arg.Conv_C_ % CBlockTransferScalarPerVector_NWaveNPerXdl == 0)) + { + return false; + } + + // Gridwise GEMM size + return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + arg.c_grid_desc_m_n_, + arg.block_2_ctile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const InDataType* p_in_grid, + WeiDataType* p_wei_grid, + const OutDataType* p_out_grid, + ck::index_t G, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) + { + return Argument{p_in_grid, + p_wei_grid, + p_out_grid, + G, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + 
output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op, + split_k}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_grid, + void* p_wei_grid, + const void* p_out_grid, + ck::index_t G, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::array input_spatial_lengths, + std::array filter_spatial_lengths, + std::array output_spatial_lengths, + std::array conv_filter_strides, + std::array conv_filter_dilations, + std::array input_left_pads, + std::array input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op, + ck::index_t split_k) override + { + return std::make_unique(static_cast(p_in_grid), + static_cast(p_wei_grid), + static_cast(p_out_grid), + G, + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + 1, + 1, + in_element_op, + wei_element_op, + out_element_op, + split_k); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << K0PerBlock << ", " + << getConvBackwardWeightSpecializationString(ConvBackwardWeightSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp new file mode 100644 index 00000000..03185d5b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Grouped Convolution Forward: +// input : input image A[G, N, C, Hi, Wi], +// input : weight B[G, K, C, Y, X], +// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ... +// output : output image E[G, N, K, Ho, Wo] +// output : R0[G, N, Ho, Wo], R1[G, N, Ho, Wo], ... +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Q0 = reduce0(q_op0(E)), Q1 = reduce1(q_op0(E)), ... +// R0 = r_op0(Q0), R1 = r_op1(Q1), ... +// Assume: +// D0, D1, ... 
and E have the same layout +template +struct DeviceGroupedConvFwdMultipleDMultipleR : public BaseOperator +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + static constexpr index_t NumRTensor = RsDataType::Size(); + + virtual std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + std::array p_rs, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& r_g_n_wos_lengths, + const std::array& r_g_n_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const QsElementwiseOperation& qs_element_op, + const RsElementwiseOperation& rs_element_op) = 0; + + virtual std::unique_ptr MakeInvokerPointer() = 0; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp new file mode 100644 index 00000000..8b54ee49 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -0,0 +1,1105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
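+//
+// XDL (matrix-core) + C-shuffle implementation of the DeviceGroupedConvFwdMultipleDMultipleR
+// interface declared in device_grouped_conv_fwd_multiple_d_multiple_r.hpp. One illustrative
+// configuration of that interface (an example only; other fusions are possible) uses a single
+// bias-like D tensor and two reductions over the output channels K:
+//
+//   E  = cde_op(A * B, D0)         // convolution output plus bias
+//   Q0 = reduce_sum(q_op0(E)),  q_op0 = identity
+//   Q1 = reduce_sum(q_op1(E)),  q_op1 = square
+//   R0 = Q0 / K,  R1 = Q1 / K      // per-pixel mean and mean-of-squares over K
+//
+// which yields the statistics a subsequent normalization step can consume.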
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_multiple_r.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" +#include "ck/library/utility/numeric.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +template +struct ComputePtrOffsetOfStridedBatch +{ + ComputePtrOffsetOfStridedBatch() = default; + + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + Array BatchStrideDs, + index_t BatchStrideE, + Array BatchStrideRs) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideE_(BatchStrideE), + BatchStrideRs_(BatchStrideRs) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + Array ds_offset; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { ds_offset(i) = g_idx * static_cast(BatchStrideDs_[i]); }); + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE_); + } + + __host__ __device__ constexpr auto GetRsPtrOffset(index_t g_idx) const + { + Array rs_offset; + static_for<0, NumRTensor, 1>{}( + [&](auto i) { rs_offset(i) = g_idx * static_cast(BatchStrideRs_[i]); }); + return rs_offset; + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + Array BatchStrideDs_; + index_t BatchStrideE_; + Array BatchStrideRs_; +}; + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). 
\link + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for + * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the + * computing of pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. + * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). + * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_batch_gemm_multiple_d_xdl_cshuffle( + const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + RsPointer p_rs_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const QsElementwiseOperation qs_element_op, + const RsElementwiseOperation rs_element_op, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const RsGridDescriptor_MBlock_MPerBlock rs_grid_desc_mblock_mperblock, + const Block2ETileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + const auto rs_batch_offset = compute_ptr_offset_of_batch.GetRsPtrOffset(g_idx); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + RsPointer p_rs_grid_grp; + + static constexpr index_t NumRTensor = RsGridDescriptor_MBlock_MPerBlock::Size(); + + static_for<0, NumRTensor, 1>{}( + [&](auto i) { p_rs_grid_grp(i) = p_rs_grid[i] + rs_batch_offset[i]; }); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_rs_grid_grp, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + rs_grid_desc_mblock_mperblock, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + 
ignore = p_ds_grid; + ignore = p_e_grid; + ignore = p_rs_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_; + ignore = rs_grid_desc_mblock_mperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = qs_element_op; + ignore = rs_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; +#endif +} + +} // namespace + +template +struct DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle + : public DeviceGroupedConvFwdMultipleDMultipleR +{ + using DeviceOp = DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + static constexpr index_t NumRTensor = RsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + template + static auto + MakeAGridDescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + return in_gemmm_gemmk_desc; + } + + template + static auto + MakeBGridDescriptor_N_K(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) + { + const auto wei_gemmnraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); + + const auto wei_gemmn_gemmk_desc = + matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + return wei_gemmn_gemmk_desc; + } + + template + static auto + MakeEGridDescriptor_M_N(const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides) + { + const auto out_gemmmraw_gemmnraw_desc = + conv_to_gemm_transformer.template MakeCDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides); + + const auto out_gemmm_gemmn_desc = + matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc); + + return out_gemmm_gemmn_desc; + } + + template + static auto GetPaddedRGridDescriptor(Descriptor descriptor, index_t MRaw) + { + const auto M = math::integer_divide_ceil(MRaw, MPerBlock) * MPerBlock; + const auto MPad = M - MRaw; + + if constexpr(GemmSpec == GemmSpecialization::MPadding || + GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::MNKPadding) + { + // pad M + return transform_tensor_descriptor( + descriptor, + make_tuple(make_right_pad_transform(descriptor, MPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + } + else + { + // not 
pad M + return descriptor; + } + } + + template || + is_same_v || + is_same_v, + bool>::type = false> + static auto + MakeRGridDescriptor_M(const std::array& r_g_n_wos_lengths, + const std::array& /* r_g_n_wos_strides */) + { + const index_t N = r_g_n_wos_lengths[1]; + + const index_t NHoWo = + N * ck::accumulate_n( + r_g_n_wos_lengths.begin() + 2, NDimSpatial, 1, std::multiplies<>()); + + const auto r_grid_desc_mraw = make_naive_tensor_descriptor_packed(make_tuple(NHoWo)); + + return GetPaddedRGridDescriptor(r_grid_desc_mraw, NHoWo); + } + + template || + is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v, + bool>::type = false> + static auto MakeRGridDescriptor_M(const std::array& r_g_n_wos_lengths, + const std::array& r_g_n_wos_strides) + { + const index_t N = r_g_n_wos_lengths[1]; + + const index_t WoStride = r_g_n_wos_strides[NDimSpatial + 2]; + + const index_t NHoWo = + N * ck::accumulate_n( + r_g_n_wos_lengths.begin() + 2, NDimSpatial, 1, std::multiplies<>()); + + const auto r_grid_desc_mraw = + make_naive_tensor_descriptor(make_tuple(NHoWo), make_tuple(WoStride)); + + return GetPaddedRGridDescriptor(r_grid_desc_mraw, NHoWo); + } + + using AGridDesc_M_K = remove_cvref_t({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; + using BGridDesc_N_K = remove_cvref_t({}, {}))>; + using EGridDesc_M_N = remove_cvref_t({}, {}))>; + using RGridDesc_M = remove_cvref_t({}, {}))>; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + ReduceAccDataType, + RsDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + QsElementwiseOperation, + RsElementwiseOperation, + ThreadReduceOperations, + InMemoryDataOperationEnum::Set, + RsGlobalMemoryDataOperation, + AGridDesc_M_K, + BGridDesc_N_K, + EGridDesc_M_N, + RGridDesc_M, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + RThreadTransferDstScalarPerVector_MPerBlock, + LoopSched>; + + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + + using Block2ETileMap = typename GridwiseGemm::DefaultBlock2ETileMap; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + std::array p_rs, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& + ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& + ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& r_g_n_wos_lengths, 
+ const std::array& r_g_n_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const QsElementwiseOperation& qs_element_op, + const RsElementwiseOperation& rs_element_op) + : p_a_grid_{static_cast(p_a)}, + p_b_grid_{static_cast(p_b)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e)}, + p_rs_grid_{}, // FIXME + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides)}, + r_grid_desc_m_{ + DeviceOp::MakeRGridDescriptor_M(r_g_n_wos_lengths, r_g_n_wos_strides)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + rs_grid_desc_mblock_mperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + compute_ptr_offset_of_batch_{}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + qs_element_op_{qs_element_op}, + rs_element_op_{rs_element_op}, + a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths}, + a_g_n_c_wis_strides_{a_g_n_c_wis_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + ds_g_n_k_wos_lengths_{ds_g_n_k_wos_lengths}, + ds_g_n_k_wos_strides_{ds_g_n_k_wos_strides}, + e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths}, + e_g_n_k_wos_strides_{e_g_n_k_wos_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // A/B/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + e_grid_desc_m_n_, + r_grid_desc_m_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds[i]); + + // D batch stride + compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0]; + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N( + ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_(i) = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_(i)); + }); + + // populate pointer for Rs + static_for<0, NumRTensor, 1>{}([&](auto i) { + using RDataType = remove_cvref_t>; + + // R 
pointer + p_rs_grid_(i) = static_cast(p_rs[i]); + + rs_grid_desc_mblock_mperblock_(i) = + GridwiseGemm::MakeRGridDescriptor_MBlock_MPerBlock(r_grid_desc_m_); + }); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + typename GridwiseGemm::RsGridPointer p_rs_grid_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + EGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + RGridDesc_M r_grid_desc_m_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + StaticallyIndexedArray< + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + NumDTensor> + ds_grid_desc_mblock_mperblock_nblock_nperblock_; // FIXME: Ds desc may be of different + // type from E + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_; + + StaticallyIndexedArray + rs_grid_desc_mblock_mperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + QsElementwiseOperation qs_element_op_; + RsElementwiseOperation rs_element_op_; + + // for checking IsSupportedArgument() + std::array a_g_n_c_wis_lengths_; + std::array a_g_n_c_wis_strides_; + std::array b_g_k_c_xs_lengths_; + std::array b_g_k_c_xs_strides_; + std::array, NumDTensor> ds_g_n_k_wos_lengths_; + std::array, NumDTensor> ds_g_n_k_wos_strides_; + std::array e_g_n_k_wos_lengths_; + std::array e_g_n_k_wos_strides_; + std::array conv_filter_strides_; + std::array conv_filter_dilations_; + std::array input_left_pads_; + std::array input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.e_grid_desc_m_n_, + arg.r_grid_desc_m_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! 
GridwiseGemmMultipleD_xdl_cshuffle has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * + arg.a_g_n_c_wis_lengths_[0]; // Group count + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_batch_gemm_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + typename GridwiseGemm::RsGridPointer, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + QsElementwiseOperation, + RsElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + ck::StaticallyIndexedArray< + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + NumDTensor>, + typename GridwiseGemm::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock, + ck::StaticallyIndexedArray< + typename GridwiseGemm::RGridDescriptor_MBlock_MPerBlock, + NumRTensor>, + Block2ETileMap, + ComputePtrOffsetOfStridedBatch, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.p_rs_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.qs_element_op_, + arg.rs_element_op_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.rs_grid_desc_mblock_mperblock_, + arg.block_2_etile_map_, + arg.compute_ptr_offset_of_batch_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + namespace ctc = tensor_layout::convolution; + + // check device + if(get_device_name() == "gfx908") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v)) + { + return false; + } + } + else if(get_device_name() == "gfx90a") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v || is_same_v)) + { + return false; + } + } + else + { + return false; + } + + // check ConvolutionForwardSpecialization + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t ConvStride = arg.conv_filter_strides_[i]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0)) + { + return false; + } + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && LeftPad == 0 && RightPad == 0)) + { + return 
false; + } + } + } + + // check vector access of A + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t C = arg.a_g_n_c_wis_lengths_[2]; + + if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of B + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + + { + const index_t C = arg.b_g_k_c_xs_lengths_[2]; + + if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of Ds + bool valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; + + if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) + { + valid = false; + } + } + else + { + valid = false; + } + }); + + if(!valid) + { + return false; + } + + // check vector access of E + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.e_g_n_k_wos_lengths_[2]; + + if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of R + if constexpr(!(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v)) + { + return false; + } + + // check Gridwise GEMM + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.e_grid_desc_m_n_, + arg.r_grid_desc_m_, + arg.block_2_etile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + std::array p_rs, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& r_g_n_wos_lengths, + const std::array& r_g_n_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const QsElementwiseOperation& qs_element_op, + const RsElementwiseOperation& rs_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + p_rs, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + r_g_n_wos_lengths, + r_g_n_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op}; + } + + 
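+
+    // Typical host-side flow (a sketch; the concrete DeviceOp instantiation, tensor pointers
+    // and element-wise operations below are placeholders and depend on the application):
+    //
+    //   auto device_op = DeviceOp{};
+    //   auto argument  = device_op.MakeArgument(p_a, p_b, p_ds, p_e, p_rs,
+    //                                           a_g_n_c_wis_lengths, a_g_n_c_wis_strides,
+    //                                           b_g_k_c_xs_lengths,  b_g_k_c_xs_strides,
+    //                                           ds_g_n_k_wos_lengths, ds_g_n_k_wos_strides,
+    //                                           e_g_n_k_wos_lengths, e_g_n_k_wos_strides,
+    //                                           r_g_n_wos_lengths,   r_g_n_wos_strides,
+    //                                           conv_filter_strides, conv_filter_dilations,
+    //                                           input_left_pads,     input_right_pads,
+    //                                           a_element_op, b_element_op, cde_element_op,
+    //                                           qs_element_op, rs_element_op);
+    //
+    //   if(!DeviceOp::IsSupportedArgument(argument))
+    //   {
+    //       // fall back to another instance / configuration
+    //   }
+    //
+    //   auto invoker = device_op.MakeInvoker();
+    //   invoker.Run(argument, StreamConfig{}); // returns the measured kernel time when
+    //                                          // timing is enabled in the StreamConfig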
static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + std::array p_rs, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& r_g_n_wos_lengths, + const std::array& r_g_n_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const QsElementwiseOperation& qs_element_op, + const RsElementwiseOperation& rs_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + p_rs, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + r_g_n_wos_lengths, + r_g_n_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op, + qs_element_op, + rs_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..bb7a2f8c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,952 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
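+//
+// Grouped forward convolution where an arbitrary number of extra D tensors (bias, residual,
+// ...) are fused into the output E through CDEElementwiseOperation. A minimal sketch of such
+// an operation for one bias-like D tensor (the functor name and exact operator() signature
+// here are illustrative; the library provides ready-made operations such as PassThrough):
+//
+//   struct AddBias
+//   {
+//       template <typename E, typename C, typename D>
+//       __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const
+//       {
+//           e = ck::type_convert<E>(c + ck::type_convert<C>(d));
+//       }
+//   };
+//
+// The functor is applied element-wise while the accumulator tile C is written back to E.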
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +namespace { + +template +struct ComputePtrOffsetOfStridedBatch +{ + ComputePtrOffsetOfStridedBatch() = default; + + ComputePtrOffsetOfStridedBatch(index_t BatchStrideA, + index_t BatchStrideB, + Array BatchStrideDs, + index_t BatchStrideE) + : BatchStrideA_(BatchStrideA), + BatchStrideB_(BatchStrideB), + BatchStrideDs_(BatchStrideDs), + BatchStrideE_(BatchStrideE) + { + } + + __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideA_); + } + + __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideB_); + } + + __host__ __device__ constexpr auto GetDsPtrOffset(index_t g_idx) const + { + Array ds_offset; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { ds_offset(i) = g_idx * static_cast(BatchStrideDs_[i]); }); + return ds_offset; + } + + __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const + { + return g_idx * static_cast(BatchStrideE_); + } + + index_t BatchStrideA_; + index_t BatchStrideB_; + Array BatchStrideDs_; + index_t BatchStrideE_; +}; + +/* + * \brief Wrapper function of GridwiseGemm::Run to realize BatchedGEMM. + * + * \tparam ComputePtrOffsetOfBatch Class that computes the base pointer offsets of A, B, C matrix + * given the batch. For example, ComputePtrOffsetOfStridedBatch() computes the offsets of evenly + * strided batched, but we can easily extend to other layouts. The returned offset can be either \p + * index_t or \p long_index_t. If it returns \p long_index_t, we are not subject to the 2GB + * limitations. + * + * \tparam Block2ETileMap Block2ETileMap::CalculateBottomIndex() takes in id of a workgroup and + * returns the 2D index of the tile that it computes. \see + * GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3::Run(). + * + * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2 + * tiles from different matrices. Keep in mind that these 2 matrices can share the same grid + * descriptor (like in BatchedGEMM), or use their own grid descriptors (in GroupedGemm). \link + * impl/device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for + * \link DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the + * computing of pointer offset into \p ComputePtrOffsetOfStridedBatch. + * + * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes. 
+ * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to + * realize BatchedGemm and GroupedGemm (and the corresponding GEMM fusion). + * + */ +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_conv_fwd_multiple_d_xdl_cshuffle( + const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation cde_element_op, + const index_t batch_count, + const AGridDesc_AK0_M_AK1 a_grid_desc_k0_m_k1, + const BGridDesc_BK0_N_BK1 b_grid_desc_k0_n_k1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + e_grid_desc_mblock_mperblock_nblock_nperblock_, + const Block2ETileMap block_2_ctile_map, + const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + // offset base pointer for each work-group + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx))); + const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx))); + const long_index_t e_batch_offset = __builtin_amdgcn_readfirstlane( + static_cast(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx))); + + const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx); + + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + DsPointer p_ds_grid_grp; + + static constexpr index_t NumDTensor = + DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size(); + + static_for<0, NumDTensor, 1>{}( + [&](auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; }); + + GridwiseGemm::template Run(p_a_grid + a_batch_offset, + p_b_grid + b_batch_offset, + p_ds_grid_grp, + p_e_grid + e_batch_offset, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock_, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_ds_grid; + ignore = p_e_grid; + ignore = batch_count; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_; + ignore = a_element_op; + ignore = b_element_op; + ignore = cde_element_op; + ignore = compute_ptr_offset_of_batch; + ignore = block_2_ctile_map; +#endif +} + +} // namespace + +// +// @brief Device Convolution operation. 
+// +// Supports: +// @li Forward convolution with up to 3 spatial dimentions +// @li Input tensor in GNWC data format +// @li Weight tensor in GKXC data format +// @li Output tensor in GNWK data format +// +// 1D: +// out[N, Wo, K] = in[N, Wi, C] * wei[K, X, C] +// 2D: +// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C] +// 3D: +// out[N, Do, Ho, Wo, K] = in[N, Di, Hi, Wi, C] * wei[K, Z, Y, X, C] +// +template +struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle + : public DeviceGroupedConvFwdMultipleD +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD_Xdl_CShuffle; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + template + static auto + MakeAGridDescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + + return in_gemmm_gemmk_desc; + } + + template + static auto + MakeBGridDescriptor_N_K(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) + { + const auto wei_gemmnraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeBDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides); + + const auto wei_gemmn_gemmk_desc = + matrix_padder.PadBDescriptor_N_K(wei_gemmnraw_gemmkraw_desc); + + return wei_gemmn_gemmk_desc; + } + + template + static auto + MakeEGridDescriptor_M_N(const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides) + { + const auto out_gemmmraw_gemmnraw_desc = + conv_to_gemm_transformer.template MakeCDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides); + + const auto out_gemmm_gemmn_desc = + matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc); + + return out_gemmm_gemmn_desc; + } + + static auto MakeDsGridDescriptor_M_N( + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(ds_g_n_k_wos_lengths[i], + ds_g_n_k_wos_strides[i]); + }, + Number{}); + } + + // desc for problem definition + using AGridDesc_M_K = remove_cvref_t({}, {}, {}, {}, {}, {}, {}, {}, {}, {}))>; + using BGridDesc_N_K = remove_cvref_t({}, {}))>; + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = remove_cvref_t({}, {}))>; + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, 
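// --- Editor's sketch (not part of the patch): the MatrixPadder used by the descriptor makers
// above pads the raw GEMM sizes up to multiples of the block tile (MPerBlock/NPerBlock/KPerBlock)
// so every work-group sees a full tile. A minimal model of that rounding, using one common
// implicit-GEMM mapping for forward convolution and hypothetical sizes:
#include <cstdio>

int ceil_to_multiple(int x, int tile) { return ((x + tile - 1) / tile) * tile; }

int main()
{
    const int MPerBlock = 256, NPerBlock = 128, KPerBlock = 32;

    // raw GEMM sizes implied by a (hypothetical) conv problem:
    // M = N * Ho * Wo, N = output channels K, K = C * Y * X
    const int M_raw = 1000, N_raw = 192, K_raw = 27;

    std::printf("M: %d -> %d\n", M_raw, ceil_to_multiple(M_raw, MPerBlock));
    std::printf("N: %d -> %d\n", N_raw, ceil_to_multiple(N_raw, NPerBlock));
    std::printf("K: %d -> %d\n", K_raw, ceil_to_multiple(K_raw, KPerBlock));
    return 0;
}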
+ BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumGemmKPrefetchStage, + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + false, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + false, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + // desc for blockwise copy + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + // block-to-e-tile map + using Block2ETileMap = + remove_cvref_t; + + // Argument + struct Argument : public BaseArgument + { + Argument(const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& + ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& + ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + : p_a_grid_{static_cast(p_a)}, + p_b_grid_{static_cast(p_b)}, + p_ds_grid_{}, + p_e_grid_{static_cast(p_e)}, + num_group_{a_g_n_c_wis_lengths[0]}, + a_grid_desc_m_k_{DeviceOp::MakeAGridDescriptor_M_K(a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads)}, + b_grid_desc_n_k_{DeviceOp::MakeBGridDescriptor_N_K(b_g_k_c_xs_lengths, + b_g_k_c_xs_strides)}, + ds_grid_desc_m_n_{}, + e_grid_desc_m_n_{DeviceOp::MakeEGridDescriptor_M_N(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides)}, + a_grid_desc_ak0_m_ak1_{ + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k_)}, + b_grid_desc_bk0_n_bk1_{ + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)}, + ds_grid_desc_mblock_mperblock_nblock_nperblock_{}, + e_grid_desc_mblock_mperblock_nblock_nperblock_{}, + block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)}, + compute_ptr_offset_of_batch_{}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + cde_element_op_{cde_element_op}, + a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths}, + a_g_n_c_wis_strides_{a_g_n_c_wis_strides}, + b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths}, + b_g_k_c_xs_strides_{b_g_k_c_xs_strides}, + ds_g_n_k_wos_lengths_{ds_g_n_k_wos_lengths}, + ds_g_n_k_wos_strides_{ds_g_n_k_wos_strides}, + e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths}, + 
e_g_n_k_wos_strides_{e_g_n_k_wos_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + // A/B/E Batch Stride + compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_c_wis_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_k_c_xs_strides[0]; + compute_ptr_offset_of_batch_.BatchStrideE_ = e_g_n_k_wos_strides[0]; + + // populate pointer, batch stride, desc for Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + using DDataType = remove_cvref_t>; + + // D pointer + p_ds_grid_(i) = static_cast(p_ds[i]); + + // D batch stride + compute_ptr_offset_of_batch_.BatchStrideDs_(i) = ds_g_n_k_wos_strides[i][0]; + + // D desc + ds_grid_desc_m_n_(i) = DeviceOp::MakeEGridDescriptor_M_N( + ds_g_n_k_wos_lengths[i], ds_g_n_k_wos_strides[i]); + }); + + // populate desc for Ds/E + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_, + b_grid_desc_n_k_, + ds_grid_desc_m_n_, + e_grid_desc_m_n_, + block_2_etile_map_)) + { + e_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n_); + + ds_grid_desc_mblock_mperblock_nblock_nperblock_ = + GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n_); + } + } + + void Print() const + { + std::cout << "A[M, K]: " << a_grid_desc_m_k_ << std::endl; + std::cout << "B[N, K]: " << b_grid_desc_n_k_ << std::endl; + static_for<0, NumDTensor, 1>{}( + [&](auto i) { std::cout << "Ds[M, N]: " << ds_grid_desc_m_n_[i] << std::endl; }); + std::cout << "E[M, N]: " << e_grid_desc_m_n_ << std::endl; + } + + // private: + // pointers + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + typename GridwiseGemm::DsGridPointer p_ds_grid_; + EDataType* p_e_grid_; + + // tensor descriptors for problem definiton + index_t num_group_; + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + Block2ETileMap block_2_etile_map_; + + // for computing batch offset + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; + + // element-wise op + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation cde_element_op_; + + // for checking IsSupportedArgument() + std::array a_g_n_c_wis_lengths_; + std::array a_g_n_c_wis_strides_; + std::array b_g_k_c_xs_lengths_; + std::array b_g_k_c_xs_strides_; + std::array, NumDTensor> ds_g_n_k_wos_lengths_; + std::array, NumDTensor> ds_g_n_k_wos_strides_; + std::array e_g_n_k_wos_lengths_; + std::array e_g_n_k_wos_strides_; + std::array conv_filter_strides_; + std::array conv_filter_dilations_; + std::array input_left_pads_; + std::array input_right_pads_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + 
arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemmMultipleD_xdl_cshuffle has invalid setting"); + } + + const index_t grid_size = + arg.block_2_etile_map_.CalculateGridSize(arg.e_grid_desc_m_n_) * arg.num_group_; + + const auto K = + arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) * arg.a_grid_desc_ak0_m_ak1_.GetLength(I2); + + auto launch_kernel = [&](auto has_main_k_block_loop) { + constexpr bool has_main_loop = has_main_k_block_loop.value; + + const auto kernel = kernel_grouped_conv_fwd_multiple_d_xdl_cshuffle< + GridwiseGemm, + ADataType, // TODO: distiguish A/B datatype + typename GridwiseGemm::DsGridPointer, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + DeviceOp::AGridDesc_AK0_M_AK1, + DeviceOp::BGridDesc_BK0_N_BK1, + DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock, + Block2ETileMap, + ComputePtrOffsetOfStridedBatch, + has_main_loop>; + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_ds_grid_, + arg.p_e_grid_, + arg.a_element_op_, + arg.b_element_op_, + arg.cde_element_op_, + arg.a_g_n_c_wis_lengths_[0], // Group count + arg.a_grid_desc_ak0_m_ak1_, + arg.b_grid_desc_bk0_n_bk1_, + arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.e_grid_desc_mblock_mperblock_nblock_nperblock_, + arg.block_2_etile_map_, + arg.compute_ptr_offset_of_batch_); + }; + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K)) + { + return launch_kernel(integral_constant{}); + } + else + { + return launch_kernel(integral_constant{}); + } + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + namespace ctc = tensor_layout::convolution; + + // check device + if(get_device_name() == "gfx908") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v)) + { + return false; + } + } + else if(get_device_name() == "gfx90a") + { + if constexpr(!(is_same_v || is_same_v || + is_same_v || is_same_v)) + { + return false; + } + } + else + { + return false; + } + + // check ConvolutionForwardSpecialization + if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + // check if it's 1x1, stride=1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t ConvStride = arg.conv_filter_strides_[i]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0)) + { + return false; + } + } + } + else if constexpr(ConvForwardSpecialization == + ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // check if it's 1x1 conv + for(index_t i = 0; i < NDimSpatial; ++i) + { + const index_t X = arg.b_g_k_c_xs_lengths_[i + 2]; + const index_t LeftPad = arg.input_left_pads_[i]; + const index_t RightPad = arg.input_right_pads_[i]; + + if(!(X == 1 && LeftPad == 0 && RightPad == 0)) + { + return false; + } + } + } + + // check vector access of A + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t C = arg.a_g_n_c_wis_lengths_[2]; + + 
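// --- Editor's sketch (not part of the patch): a host-side model of two of the argument checks
// in IsSupportedArgument above -- the Filter1x1Stride1Pad0 specialization test, and the
// C % ScalarPerVector test required for vectorized loads along the channel dimension
// (the fastest-varying dimension in the layouts accepted above). Sizes are hypothetical.
#include <array>
#include <cstdio>

template <int NDimSpatial>
bool is_1x1_stride1_pad0(const std::array<int, NDimSpatial>& filter_lengths,
                         const std::array<int, NDimSpatial>& conv_strides,
                         const std::array<int, NDimSpatial>& left_pads,
                         const std::array<int, NDimSpatial>& right_pads)
{
    for(int i = 0; i < NDimSpatial; ++i)
        if(!(filter_lengths[i] == 1 && conv_strides[i] == 1 && left_pads[i] == 0 &&
             right_pads[i] == 0))
            return false;
    return true;
}

int main()
{
    constexpr int ABlockTransferSrcScalarPerVector = 8; // vector width of A loads
    const int C = 64;                                   // input channels

    const bool spec_ok   = is_1x1_stride1_pad0<2>({1, 1}, {1, 1}, {0, 0}, {0, 0});
    const bool vector_ok = (C % ABlockTransferSrcScalarPerVector == 0);
    std::printf("1x1/stride1/pad0: %d, C divisible by vector width: %d\n", spec_ok, vector_ok);
    return 0;
}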
if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of B + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + + { + const index_t C = arg.b_g_k_c_xs_lengths_[2]; + + if(!(BBlockTransferSrcVectorDim == 2 && C % BBlockTransferSrcScalarPerVector == 0)) + { + return false; + } + } + else + { + return false; + } + + // check vector access of Ds + bool valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + using DLayout = remove_cvref_t>; + + // FIXME: layout + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.ds_g_n_k_wos_lengths_[i][2]; + + if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) + { + valid = false; + } + } + else + { + valid = false; + } + }); + + if(!valid) + { + return false; + } + + // check vector access of E + if constexpr(is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v || is_same_v || + is_same_v) + { + const index_t K = arg.e_g_n_k_wos_lengths_[2]; + + if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0)) + { + return false; + } + } + else + { + return false; + } + + // check Gridwise GEMM + return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_, + arg.b_grid_desc_n_k_, + arg.ds_grid_desc_m_n_, + arg.e_grid_desc_m_n_, + arg.block_2_etile_map_); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) + { + return Argument{p_a, + p_b, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeArgumentPointer( + const void* p_a, + const void* p_b, + const std::array& p_ds, + void* p_e, + const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides, + const std::array, NumDTensor>& ds_g_n_k_wos_lengths, + const std::array, NumDTensor>& ds_g_n_k_wos_strides, + const std::array& e_g_n_k_wos_lengths, + const std::array& e_g_n_k_wos_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& 
input_right_pads, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op) override + { + return std::make_unique(p_a, + p_b, + p_ds, + p_e, + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + ds_g_n_k_wos_lengths, + ds_g_n_k_wos_strides, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + a_element_op, + b_element_op, + cde_element_op); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << getConvForwardSpecializationString(ConvForwardSpecialization) + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp new file mode 100644 index 00000000..aabcc73a --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp @@ -0,0 +1,677 @@ +#pragma once +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_grouped_gemm_xdl(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const, + const index_t group_count, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation c_element_op) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + const index_t block_id = get_block_1d_id(); + + const auto gemm_desc_ptr = + reinterpret_cast(cast_pointer_to_generic_address_space(gemm_descs_const)); + + index_t left = 0; + index_t right = group_count; + index_t group_id = index_t((left + right) / 2); + while((!(block_id >= gemm_desc_ptr[group_id].BlockStart_ && + block_id < gemm_desc_ptr[group_id].BlockEnd_)) && + left <= right) + { + if(block_id < gemm_desc_ptr[group_id].BlockStart_) + { + right = group_id; + } + else + { + left = group_id; + } + group_id = index_t((left + right) / 2); + } + + GridwiseGemm::template Run( + gemm_desc_ptr[group_id].a_ptr_, + gemm_desc_ptr[group_id].b_ptr_, + gemm_desc_ptr[group_id].ds_ptr_, + gemm_desc_ptr[group_id].e_ptr_, + p_shared, + a_element_op, + b_element_op, 
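// --- Editor's sketch (not part of the patch): a host-side, standard binary-search variant of the
// loop kernel_grouped_gemm_xdl above uses to map a block id onto a group, given each group's
// half-open [BlockStart_, BlockEnd_) block range. GroupRange and the sample ranges are
// hypothetical; the sketch assumes block_id does fall inside one of the ranges.
#include <cstdio>
#include <vector>

struct GroupRange
{
    int BlockStart_;
    int BlockEnd_;
};

int find_group(const std::vector<GroupRange>& groups, int block_id)
{
    int left = 0, right = static_cast<int>(groups.size());
    int group_id = (left + right) / 2;
    while(!(block_id >= groups[group_id].BlockStart_ && block_id < groups[group_id].BlockEnd_))
    {
        if(block_id < groups[group_id].BlockStart_) { right = group_id; }
        else                                        { left = group_id + 1; }
        group_id = (left + right) / 2;
    }
    return group_id;
}

int main()
{
    // block ranges produced by accumulating each group's grid size (see the Argument ctor below)
    const std::vector<GroupRange> groups = {{0, 30}, {30, 50}, {50, 110}};
    std::printf("block 42 -> group %d\n", find_group(groups, 42)); // prints 1
    return 0;
}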
+ c_element_op, + gemm_desc_ptr[group_id].a_grid_desc_ak0_m_ak1_, + gemm_desc_ptr[group_id].b_grid_desc_bk0_n_bk1_, + gemm_desc_ptr[group_id].ds_grid_desc_mblock_mperblock_nblock_nperblock_, + gemm_desc_ptr[group_id].e_grid_desc_mblock_mperblock_nblock_nperblock_, + gemm_desc_ptr[group_id].block_2_etile_map_); +#else + ignore = gemm_descs_const; + ignore = group_count; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; +#endif +} + +template +struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm +{ + using DeviceOp = DeviceGroupedGemm_Xdl; + + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto matrix_padder = + MatrixPadder{MPerBlock, NPerBlock, KPerBlock}; + + static auto MakeAGridDescriptor_M_K(index_t MRaw, index_t KRaw, index_t StrideA) + { + const auto a_grid_desc_mraw_kraw = [&]() { + if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(StrideA, I1)); + } + else if constexpr(is_same_v) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, KRaw), + make_tuple(I1, StrideA)); + } + }(); + + return matrix_padder.PadADescriptor_M_K(a_grid_desc_mraw_kraw); + } + + static auto MakeBGridDescriptor_N_K(index_t KRaw, index_t NRaw, index_t StrideB) + { + const auto b_grid_desc_nraw_kraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(I1, StrideB)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(NRaw, KRaw), + make_tuple(StrideB, I1)); + } + }(); + + return matrix_padder.PadBDescriptor_N_K(b_grid_desc_nraw_kraw); + } + + template + static auto MakeEGridDescriptor_M_N(index_t MRaw, index_t NRaw, index_t StrideE) + { + const auto e_grid_desc_mraw_nraw = [&]() { + if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(StrideE, I1)); + } + else if constexpr(is_same::value) + { + return make_naive_tensor_descriptor(make_tuple(MRaw, NRaw), + make_tuple(I1, StrideE)); + } + }(); + + return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw); + } + + static auto MakeDsGridDescriptor_M_N(const std::array& MRaws, + const std::array& NRaws, + const std::array& DsStride) + { + return generate_tuple( + [&](auto i) { + using DLayout = remove_cvref_t>; + + return DeviceOp::MakeEGridDescriptor_M_N(MRaws[i], NRaws[i], DsStride[i]); + }, + Number{}); + } + + using AGridDesc_M_K = decltype(MakeAGridDescriptor_M_K(1, 1, 1)); + using BGridDesc_N_K = decltype(MakeBGridDescriptor_N_K(1, 1, 1)); + using DsGridDesc_M_N = remove_cvref_t; + using EGridDesc_M_N = decltype(MakeEGridDescriptor_M_N(1, 1, 1)); + + // GridwiseGemm + using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle< + ADataType, // TODO: distinguish A/B datatype + AccDataType, + CShuffleDataType, + DsDataType, + EDataType, + AElementwiseOperation, + BElementwiseOperation, + CDEElementwiseOperation, + InMemoryDataOperationEnum::Set, + NumPrefetch, // NumGemmKPrefetchStage + BlockSize, + MPerBlock, + NPerBlock, + KPerBlock, + AK1, + BK1, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + ABlockTransferSrcAccessOrder, + ABlockTransferSrcVectorDim, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + false, // 
AThreadTransferSrcResetCoordinateAfterRun, + ABlockLdsExtraM, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + false, // BThreadTransferSrcResetCoordinateAfterRun, + BBlockLdsExtraN, + CShuffleMXdlPerWavePerShuffle, + CShuffleNXdlPerWavePerShuffle, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + CDEBlockTransferScalarPerVector_NPerBlock, + LoopSched>; + + using AGridDesc_AK0_M_AK1 = remove_cvref_t; + using BGridDesc_BK0_N_BK1 = remove_cvref_t; + using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + struct GroupedGemmBlock2ETileMap + { + using Block2ETileMap = + remove_cvref_t; + + GroupedGemmBlock2ETileMap() + { + block_2_etile_map_ = GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}); + BlockStart_ = -1; + } + + GroupedGemmBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n, ck::index_t BlockStart) + { + block_2_etile_map_ = GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n); + BlockStart_ = BlockStart; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + return block_2_etile_map_.CalculateBottomIndex( + make_multi_index(idx_top[I0] - BlockStart_)); + } + + // it's actually E-Tile + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) const + { + return block_2_etile_map_.ValidCTileIndex(c_tile_idx, c_tile_dim); + } + + __host__ bool CheckValidity(const EGridDesc_M_N& e_grid_desc_m_n) const + { + return block_2_etile_map_.CheckValidity(e_grid_desc_m_n); + } + + Block2ETileMap block_2_etile_map_; + ck::index_t BlockStart_; + }; + + struct GemmBiasTransKernelArg + { + // pointers + const ADataType* a_ptr_; + const BDataType* b_ptr_; + typename GridwiseGemm::DsGridPointer ds_ptr_; + EDataType* e_ptr_; + + // tensor descriptors for problem definiton + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_N_K b_grid_desc_n_k_; + DsGridDesc_M_N ds_grid_desc_m_n_; + EGridDesc_M_N e_grid_desc_m_n_; + + // tensor descriptors for block/thread-wise copy + AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_; + BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_; + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock_; + EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_; + + // block-to-e-tile map + GroupedGemmBlock2ETileMap block_2_etile_map_; + ck::index_t BlockStart_, BlockEnd_; + }; + + // Argument + struct Argument : public BaseArgument + { + Argument(std::vector& p_As, + std::vector& p_Bs, + std::vector>& p_Ds, + std::vector& p_Es, + std::vector& gemm_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation c_element_op) + : a_element_op_{a_element_op}, b_element_op_{b_element_op}, c_element_op_{c_element_op} + { + grid_size_ = 0; + + group_count_ = ck::type_convert(gemm_descs.size()); + + if(!(group_count_ == ck::type_convert(p_As.size()) && + group_count_ == ck::type_convert(p_Bs.size()) && + group_count_ == ck::type_convert(p_Es.size()))) + { + throw std::runtime_error("wrong! 
group_count_ != p_As/b/c.size"); + } + + gemm_desc_kernel_arg_.reserve(group_count_); + + skipped_group_count_ = 0; + + for(std::size_t i = 0; i < gemm_descs.size(); i++) + { + const index_t M = gemm_descs[i].M_; + const index_t N = gemm_descs[i].N_; + const index_t K = gemm_descs[i].K_; + + if(M == 0) + { + skipped_group_count_++; + continue; + } + + const index_t StrideA = gemm_descs[i].stride_A_; + const index_t StrideB = gemm_descs[i].stride_B_; + const index_t StrideC = gemm_descs[i].stride_C_; + + // pointer + typename GridwiseGemm::DsGridPointer p_ds_grid{}; + + static_for<0, NumDTensor, 1>{}([&](auto j) { + using DDataType = remove_cvref_t>; + + p_ds_grid(j) = static_cast(p_Ds[i][j]); + }); + + // tensor descriptors for problem definiton + const auto a_grid_desc_m_k = DeviceOp::MakeAGridDescriptor_M_K(M, K, StrideA); + const auto b_grid_desc_n_k = DeviceOp::MakeBGridDescriptor_N_K(K, N, StrideB); + + DsGridDesc_M_N ds_grid_desc_m_n; + + static_for<0, NumDTensor, 1>{}([&](auto j) { + using DLayout = remove_cvref_t>; + + ds_grid_desc_m_n(j) = DeviceOp::MakeEGridDescriptor_M_N( + M, N, gemm_descs[i].stride_Ds_[j]); + }); + + const auto e_grid_desc_m_n = + DeviceOp::MakeEGridDescriptor_M_N(M, N, StrideC); + + // tensor descriptors for block/thread-wise copy + const auto a_grid_desc_ak0_m_ak1 = + GridwiseGemm::MakeDefaultAGridDescriptor_AK0_M_AK1(a_grid_desc_m_k); + + const auto b_grid_desc_bk0_n_bk1 = + GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k); + + const index_t grid_size_grp = + GroupedGemmBlock2ETileMap(e_grid_desc_m_n, 0) + .block_2_etile_map_.CalculateGridSize(e_grid_desc_m_n); + + const index_t BlockStart = grid_size_; + const index_t BlockEnd = grid_size_ + grid_size_grp; + + grid_size_ += grid_size_grp; + + // block-to-e-tile map + const auto block_2_etile_map = + GroupedGemmBlock2ETileMap(e_grid_desc_m_n, BlockStart); + + if(GridwiseGemm::CheckValidity(a_grid_desc_m_k, + b_grid_desc_n_k, + ds_grid_desc_m_n, + e_grid_desc_m_n, + block_2_etile_map)) + { + // tensor descriptors for block/thread-wise copy + DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + ds_grid_desc_mblock_mperblock_nblock_nperblock; + + static_for<0, NumDTensor, 1>{}([&](auto j) { + ds_grid_desc_mblock_mperblock_nblock_nperblock(j) = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + ds_grid_desc_m_n[j]); + }); + + const auto e_grid_desc_mblock_mperblock_nblock_nperblock = + GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + e_grid_desc_m_n); + + gemm_desc_kernel_arg_.push_back( + GemmBiasTransKernelArg{static_cast(p_As[i]), + static_cast(p_Bs[i]), + p_ds_grid, + static_cast(p_Es[i]), + a_grid_desc_m_k, + b_grid_desc_n_k, + ds_grid_desc_m_n, + e_grid_desc_m_n, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map, + BlockStart, + BlockEnd}); + } + } + } + + // private: + index_t group_count_; + index_t skipped_group_count_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CDEElementwiseOperation c_element_op_; + + std::vector gemm_desc_kernel_arg_; + + index_t grid_size_; + }; + + // Invoker + struct Invoker : public BaseInvoker + { + using Argument = DeviceOp::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + bool has_main_k_block_loop = true; + + for(std::size_t i = 0; i < arg.gemm_desc_kernel_arg_.size(); i++) + { + std::cout << "group: " << i << 
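// --- Editor's sketch (not part of the patch): the Argument constructor above walks the groups,
// accumulates grid_size_, and hands every group the half-open block range
// [BlockStart, BlockEnd) = [grid_size_, grid_size_ + grid_size_grp). GroupedGemmBlock2ETileMap
// then subtracts BlockStart_ again so tiles inside a group are numbered from zero.
// A host-side model with hypothetical per-group tile counts:
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    const std::vector<int> tiles_per_group = {30, 20, 60}; // CalculateGridSize() of each GEMM

    int grid_size = 0;
    std::vector<std::pair<int, int>> ranges; // {BlockStart, BlockEnd} per group
    for(int tiles : tiles_per_group)
    {
        ranges.push_back({grid_size, grid_size + tiles});
        grid_size += tiles;
    }

    // a block with global id 42 falls into group 1 and works on local tile 42 - 30 = 12
    const int block_id = 42, group = 1;
    std::printf("grid_size = %d, local tile of block %d = %d\n",
                grid_size, block_id, block_id - ranges[group].first);
    return 0;
}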
" arg.a_grid_desc_ak0_m_ak1_{" + << arg.gemm_desc_kernel_arg_[i].a_grid_desc_ak0_m_ak1_.GetLength(I0) + << ", " + << arg.gemm_desc_kernel_arg_[i].a_grid_desc_ak0_m_ak1_.GetLength(I1) + << ", " + << arg.gemm_desc_kernel_arg_[i].a_grid_desc_ak0_m_ak1_.GetLength(I2) + << "}"; + + std::cout << ", arg.b_grid_desc_bk0_n_bk1_{" + << arg.gemm_desc_kernel_arg_[i].b_grid_desc_bk0_n_bk1_.GetLength(I0) + << ", " + << arg.gemm_desc_kernel_arg_[i].b_grid_desc_bk0_n_bk1_.GetLength(I1) + << ", " + << arg.gemm_desc_kernel_arg_[i].b_grid_desc_bk0_n_bk1_.GetLength(I2) + << "}"; + + std::cout << ", arg.e_grid_desc_m_n_{ " + << arg.gemm_desc_kernel_arg_[i].e_grid_desc_m_n_.GetLength(I0) << ", " + << arg.gemm_desc_kernel_arg_[i].e_grid_desc_m_n_.GetLength(I1) << "}" + << std::endl; + + if(!GridwiseGemm::CheckValidity(arg.gemm_desc_kernel_arg_[i].a_grid_desc_m_k_, + arg.gemm_desc_kernel_arg_[i].b_grid_desc_n_k_, + arg.gemm_desc_kernel_arg_[i].ds_grid_desc_m_n_, + arg.gemm_desc_kernel_arg_[i].e_grid_desc_m_n_, + arg.gemm_desc_kernel_arg_[i].block_2_etile_map_)) + { + throw std::runtime_error( + "wrong! GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 has invalid setting"); + } + + const auto K = arg.gemm_desc_kernel_arg_[i].a_grid_desc_ak0_m_ak1_.GetLength(I0) * + arg.gemm_desc_kernel_arg_[i].a_grid_desc_ak0_m_ak1_.GetLength(I2); + + if(GridwiseGemm::CalculateHasMainKBlockLoop(K) != has_main_k_block_loop) + { + throw std::runtime_error("wrong! not all gemm has_main_k_block_loop"); + } + } + + hipGetErrorString( + hipMemcpy(arg.p_workspace_, + arg.gemm_desc_kernel_arg_.data(), + arg.gemm_desc_kernel_arg_.size() * sizeof(GemmBiasTransKernelArg), + hipMemcpyHostToDevice)); + + float ave_time = 0; + + auto launch_kernel = [&](auto has_main_k_block_loop_) { + const auto kernel = kernel_grouped_gemm_xdl; + + return launch_and_time_kernel( + stream_config, + kernel, + dim3(arg.grid_size_), + dim3(BlockSize), + 0, + cast_pointer_to_constant_address_space(arg.p_workspace_), + arg.gemm_desc_kernel_arg_.size(), + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_); + }; + + if(has_main_k_block_loop) + { + ave_time = launch_kernel(integral_constant{}); + } + else + { + ave_time = launch_kernel(integral_constant{}); + } + + return ave_time; + } + + // polymorphic + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if((ck::type_convert(arg.gemm_desc_kernel_arg_.size()) + + arg.skipped_group_count_) != arg.group_count_) + { + return false; + } + + return true; + } + + // polymorphic + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(std::vector& p_As, + std::vector& p_Bs, + std::vector>& p_Ds, + std::vector& p_Es, + std::vector gemm_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation c_element_op) + { + return Argument{ + p_As, p_Bs, p_Ds, p_Es, gemm_descs, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + // polymorphic + std::unique_ptr + MakeArgumentPointer(std::vector& p_As, + std::vector& p_Bs, + std::vector>& p_Ds, + std::vector& p_Es, + std::vector& gemm_descs, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CDEElementwiseOperation c_element_op) override + { + return std::make_unique( + p_As, p_Bs, p_Ds, p_Es, 
gemm_descs, a_element_op, b_element_op, c_element_op); + } + + // polymorphic + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + // polymorphic + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceGroupedGemm_Xdl" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << NPerBlock << ", " + << KPerBlock << ", " + << AK1 << ", " + << BK1 << ", " + << MPerXDL << ", " + << NPerXDL << ", " + << MXdlPerWave << ", " + << NXdlPerWave + << ">"; + // clang-format on + + return str.str(); + } + + size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override + { + return dynamic_cast(p_arg)->group_count_ * sizeof(GemmBiasTransKernelArg); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp new file mode 100644 index 00000000..dbeeb980 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp @@ -0,0 +1,595 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/sequence.hpp" +#include "ck/utility/reduction_operator.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypeTuple::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static_assert(sequence_all_of(OutDstVectorSizeSeq{}, + [](auto vectorSize) { + return (MThreadSliceSize % vectorSize == 0); + }), + "The OutDstVectorSize should completely divide the MThreadSliceSize!"); + + static constexpr bool CheckDataTypeTuple() + { + bool flag = true; + + static_for<0, NumReduction, 1>{}([&](auto I) { + using OutDataType = remove_cvref_t; + flag = + flag && ck::reduce::InMemoryDataOperatonSupportedOnDataType::value; + }); + + return flag; + }; + + static_assert(CheckDataTypeTuple(), + "The OutDataType must support the specified OutMemoryDataOperation!"); + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumInputDim = Rank; + static constexpr index_t NumOutputDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + // So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added + // later + static constexpr bool use_multiblock = + (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd); + + static_assert( + ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation), + "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!"); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc 
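// --- Editor's sketch (not part of the patch): MakeSrc2dDescriptor above views the input as a 2D
// [M, K] problem -- M is the product of the invariant dimensions, K the product of the reduced
// ones (moved to the back by shuffle_tensor_dimensions) -- and then right-pads both so they
// divide evenly into block tiles. A host-side model of the resulting sizes, with hypothetical
// lengths and tile sizes:
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    const std::vector<int> invariant_lengths = {8, 16};  // -> M = 128
    const std::vector<int> reduce_lengths    = {7, 300}; // -> K = 2100

    const int M = std::accumulate(
        invariant_lengths.begin(), invariant_lengths.end(), 1, std::multiplies<int>());
    const int K = std::accumulate(
        reduce_lengths.begin(), reduce_lengths.end(), 1, std::multiplies<int>());

    const int M_BlockTileSize = 64, K_BlockTileSize = 32; // cluster size * slice size
    const int numBlockTileIteration = (K + K_BlockTileSize - 1) / K_BlockTileSize;
    const int blkGroupSize          = 1;                  // single block per output row here

    const int padded_M = ((M + M_BlockTileSize - 1) / M_BlockTileSize) * M_BlockTileSize;
    const int padded_K = K_BlockTileSize * numBlockTileIteration * blkGroupSize;

    std::printf("M %d -> %d, K %d -> %d (pad_K = %d)\n", M, padded_M, K, padded_K, padded_K - K);
    return 0;
}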
= make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto out_grid_desc_m_padded = transform_tensor_descriptor( + out_grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto GenerateOutGrid1dDescTuple() + { + return generate_tuple( + [&](auto I) { + (void)I; + return MakeDst1dDescriptor(std::array{}, + std::array{}); + }, + Number{}); + }; + + using InGridDesc_M_K = decltype(MakeSrc2dDescriptor( + std::array{}, std::array{}, 1, 1)); + using OutGridDesc_M_Tuple = decltype(GenerateOutGrid1dDescTuple()); + + static auto MakeDst1dDescriptorForBufferSet(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto length = out_grid_desc_m.GetLength(Number<0>{}); + + const auto pad = math::integer_least_multiple(length, BlockSize) - length; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(length, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto GenerateOutGrid1dDescTuple_2() + { + return generate_tuple( + [&](auto I) { + (void)I; + return MakeDst1dDescriptorForBufferSet(std::array{}, + std::array{}); + }, + Number{}); + }; + + using OutGridDesc_M_Tuple_2 = decltype(GenerateOutGrid1dDescTuple_2()); + + struct Argument : public BaseArgument + { + Argument(const std::array& inLengths, + const std::array& inStrides, + const std::array& outLengths, + const std::array, NumReduction>& outStridesArray, + const std::array& reduceDims, + const std::array& alphas, + const std::array& betas, + const void* in_dev, + const std::array& out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) + : outLengths_{outLengths}, + outStridesArray_{outStridesArray}, + in_elementwise_op_tuple_{in_elementwise_op_tuple}, + acc_elementwise_op_tuple_{acc_elementwise_op_tuple} + { + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + for(size_t i = 0; i < NumReduction; i++) + { + alpha_values_(i) = *static_cast(alphas[i]); + beta_values_(i) = *static_cast(betas[i]); + }; + + in_dev_ = static_cast(in_dev); + + out_dev_buffers_ = generate_tuple( + [&](auto iR) { + using OutDataTypePointer = + remove_cvref_t; + using OutDataType = remove_cvref_t>; + return static_cast(out_dev_buffers[iR]); + }, + Number{}); + + std::tie(invariant_total_length, reduce_total_length) 
= + get_2d_lengths(inLengths_); + + if constexpr(use_multiblock) + { + + int iterations = 1; + while(true) + { + int testBlkGroupSize = + (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = + (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + in_grid_desc_m_k = + MakeSrc2dDescriptor(inLengths_, inStrides_, blkGroupSize, numBlockTileIteration); + + out_grid_desc_m_tuple = generate_tuple( + [&](auto I) { return MakeDst1dDescriptor(outLengths, outStridesArray[I]); }, + Number{}); + + out_grid_desc_m_tuple_2 = generate_tuple( + [&](auto I) { + return MakeDst1dDescriptorForBufferSet(outLengths, outStridesArray[I]); + }, + Number{}); + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize; + + gridSize_pre = + math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize; + } + + std::array inLengths_; + std::array inStrides_; + + std::array outLengths_; + std::array, NumReduction> outStridesArray_; + + Array alpha_values_; + Array beta_values_; + + const InDataType* in_dev_; + OutDataTypePointerTuple out_dev_buffers_; + + InGridDesc_M_K in_grid_desc_m_k; + OutGridDesc_M_Tuple out_grid_desc_m_tuple; + OutGridDesc_M_Tuple_2 out_grid_desc_m_tuple_2; + + InElementwiseOperationTuple in_elementwise_op_tuple_; + AccElementwiseOperationTuple acc_elementwise_op_tuple_; + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + int blkGroupSize; + int numBlockTileIteration; + size_t gridSize; + + size_t gridSize_pre; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + using GridwiseMultipleReduce = + GridwiseMultipleReduction_mk_to_m_multiblock; + + const auto kernel_main = + kernel_multiple_reduce_multiblock; + + float avg_time = 0; + + if constexpr(use_multiblock) + { + auto identity_values = generate_tuple( + [&](auto iR) { + using OutDataType = remove_cvref_t; + return ck::reduce::GetIdentityValueForInMemoryDataOperation( + OutMemoryDataOperation); + }, + Number{}); + + const auto kernel_pre = kernel_multiple_buffer_set_value; + + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + arg.out_grid_desc_m_tuple_2, + arg.out_dev_buffers_, + identity_values); + }; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.in_grid_desc_m_k, + arg.out_grid_desc_m_tuple, + arg.in_elementwise_op_tuple_, + arg.acc_elementwise_op_tuple_, + arg.blkGroupSize, + arg.numBlockTileIteration, + arg.alpha_values_, + arg.in_dev_, + arg.beta_values_, + arg.out_dev_buffers_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(use_multiblock) + { + for(size_t i = 0; i < pArg->beta_values_.Size(); i++) + if(pArg->beta_values_[i] != 0.0f) + return (false); + 
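// --- Editor's sketch (not part of the patch): for the AtomicAdd (multiblock) path the Argument
// constructor above grows the number of K tiles handled per block until no more than 128 blocks
// cooperate on one output row. A host-side copy of that loop with a hypothetical reduce length:
#include <cstdio>

int main()
{
    const long reduce_total_length = 1 << 20; // elements reduced per output value
    const int  K_BlockTileSize     = 256;     // KThreadClusterSize * KThreadSliceSize

    int iterations = 1;
    while(true)
    {
        const long testBlkGroupSize =
            (reduce_total_length + static_cast<long>(K_BlockTileSize) * iterations - 1) /
            (static_cast<long>(K_BlockTileSize) * iterations);

        if(testBlkGroupSize <= 128) // cap the number of atomically-adding blocks per output
            break;

        ++iterations;
    }

    const long blkGroupSize =
        (reduce_total_length + static_cast<long>(K_BlockTileSize) * iterations - 1) /
        (static_cast<long>(K_BlockTileSize) * iterations);

    std::printf("iterations = %d, blkGroupSize = %ld\n", iterations, blkGroupSize);
    return 0;
}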
}; + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return (false); + } + else + { + if(pArg->inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[NumInvariantDim - 1] % InSrcVectorSize != 0) + return (false); + }; + } + else + { + if(pArg->inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[Rank - 1] % InSrcVectorSize != 0) + return (false); + }; + // To improve + bool valid = true; + static_for<0, NumReduction, 1>{}([&](auto I) { + if(pArg->outStridesArray_[I.value][NumOutputDim - 1] != 1 && + OutDstVectorSizeSeq::At(I) != 1) + valid = false; + + if(pArg->outLengths_[NumOutputDim - 1] % OutDstVectorSizeSeq::At(I) != 0) + valid = false; + }); + + if(!valid) + return (false); + + if constexpr(use_multiblock) + { + // blkGroupSize of 1 should be handled by Blockwise path using + // InMemoryDataOperationEnum::Set + if(pArg->blkGroupSize == 1) + return (false); + + // This is very strong restriction, but needed to avoid some failure + if(pArg->outLengths_[NumOutputDim - 1] % M_BlockTileSize != 0) + return (false); + } + else + { + // cases with very small reduce_total_length should be handled by ThreadWise kernel + if(pArg->reduce_total_length / KThreadSliceSize < 2) + return (false); + }; + + return (true); + }; + + std::unique_ptr MakeArgumentPointer( + const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array, NumReduction> outStridesArray, + const std::array reduceDims, + const std::array alphas, + const std::array betas, + const void* in_dev, + const std::array out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStridesArray, + reduceDims, + alphas, + betas, + in_dev, + out_dev_buffers, + in_elementwise_op_tuple, + acc_elementwise_op_tuple); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceMultipleReduceBlockWise<" : "DeviceMultipleReduceMultiBlock<") << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << ","; + str << "OutDstVectorSize"; + static_for<0, OutDstVectorSizeSeq::Size(), 1>{}([&](auto I) {str << "_" << OutDstVectorSizeSeq::At(I); }); + str << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp new file mode 100644 index 00000000..ff8465e9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_threadwise.hpp @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/sequence.hpp" +#include "ck/utility/reduction_operator.hpp" + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_multiple_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceMultipleReduceThreadWise : public DeviceMultipleReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static_assert((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypeTuple::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static_assert(sequence_all_of(OutDstVectorSizeSeq{}, + [](auto vectorSize) { + return (MThreadSliceSize % vectorSize == 0); + }), + "The OutDstVectorSize should completely divide the MThreadSliceSize!"); + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumInputDim = Rank; + static constexpr index_t NumOutputDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize; + + static auto GenerateOutDataTypePointerTuple() + { + return generate_tuple( + [&](auto I) { + using DataType = remove_cvref_t; + + return static_cast(nullptr); + }, + Number{}); + }; + + using OutDataTypePointerTuple = decltype(GenerateOutDataTypePointerTuple()); + + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumInputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, 
ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = + math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumOutputDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto out_grid_desc_m_padded = transform_tensor_descriptor( + out_grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto GenerateOutGrid1dDescTuple() + { + return generate_tuple( + [&](auto I) { + (void)I; + return MakeDst1dDescriptor(std::array{}, + std::array{}); + }, + Number{}); + }; + + using InGridDesc_M_K = decltype(MakeSrc2dDescriptor(std::array{}, + std::array{})); + using OutGridDesc_M_Tuple = decltype(GenerateOutGrid1dDescTuple()); + + struct Argument : public BaseArgument + { + Argument(const std::array& inLengths, + const std::array& inStrides, + const std::array& outLengths, + const std::array, NumReduction>& outStridesArray, + const std::array& reduceDims, + const std::array& alphas, + const std::array& betas, + const void* in_dev, + const std::array& out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) + : outLengths_{outLengths}, + outStridesArray_{outStridesArray}, + in_elementwise_op_tuple_{in_elementwise_op_tuple}, + acc_elementwise_op_tuple_{acc_elementwise_op_tuple} + { + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + for(size_t i = 0; i < NumReduction; i++) + { + alpha_values_(i) = *static_cast(alphas[i]); + beta_values_(i) = *static_cast(betas[i]); + }; + + in_dev_ = static_cast(in_dev); + + out_dev_buffers_ = generate_tuple( + [&](auto iR) { + using OutDataTypePointer = + remove_cvref_t; + using OutDataType = remove_cvref_t>; + return static_cast(out_dev_buffers[iR]); + }, + Number{}); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths_); + + in_grid_desc_m_k = MakeSrc2dDescriptor(inLengths_, inStrides_); + + out_grid_desc_m_tuple = generate_tuple( + [&](auto I) { return MakeDst1dDescriptor(outLengths, 
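// --- Editor's sketch (not part of the patch): in the threadwise variant each thread owns a full
// reduction (K_BlockTileSize = KThreadSliceSize, no cross-thread cooperation), so the grid only
// has to cover the invariant length M -- which is what the gridSize computed just after this
// point does. A host-side model with hypothetical sizes:
#include <cstdio>

int main()
{
    const int BlockSize        = 256;
    const int MThreadSliceSize = 4;
    const int M_BlockTileSize  = BlockSize * MThreadSliceSize; // rows handled per work-group

    const long invariant_total_length = 10000; // product of the non-reduced dimensions

    // integer_least_multiple(x, tile) / tile is simply ceil(x / tile)
    const long gridSize = (invariant_total_length + M_BlockTileSize - 1) / M_BlockTileSize;

    std::printf("M_BlockTileSize = %d, gridSize = %ld work-groups\n", M_BlockTileSize, gridSize);
    return 0;
}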
outStridesArray[I]); }, + Number{}); + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize; + } + + std::array inLengths_; + std::array inStrides_; + + std::array outLengths_; + std::array, NumReduction> outStridesArray_; + + Array alpha_values_; + Array beta_values_; + + const InDataType* in_dev_; + OutDataTypePointerTuple out_dev_buffers_; + + InGridDesc_M_K in_grid_desc_m_k; + OutGridDesc_M_Tuple out_grid_desc_m_tuple; + + InElementwiseOperationTuple in_elementwise_op_tuple_; + AccElementwiseOperationTuple acc_elementwise_op_tuple_; + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + using GridwiseMultipleReduce = + GridwiseMultipleReduction_mk_to_m_threadwise; + + const auto kernel_main = + kernel_multiple_reduce_threadwise; + + float avg_time = 0; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + arg.in_grid_desc_m_k, + arg.out_grid_desc_m_tuple, + arg.in_elementwise_op_tuple_, + arg.acc_elementwise_op_tuple_, + arg.alpha_values_, + arg.in_dev_, + arg.beta_values_, + arg.out_dev_buffers_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return (false); + } + else + { + if(pArg->inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[NumInvariantDim - 1] % InSrcVectorSize != 0) + return (false); + }; + } + else + { + if(pArg->inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1) + return (false); + + if(pArg->inLengths_[Rank - 1] % InSrcVectorSize != 0) + return (false); + }; + + // To improve + bool valid = true; + static_for<0, NumReduction, 1>{}([&](auto I) { + if(pArg->outStridesArray_[I.value][NumOutputDim - 1] != 1 && + OutDstVectorSizeSeq::At(I) != 1) + valid = false; + + if(pArg->outLengths_[NumOutputDim - 1] % OutDstVectorSizeSeq::At(I) != 0) + valid = false; + }); + + if(!valid) + return (false); + + return (true); + }; + + std::unique_ptr MakeArgumentPointer( + const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array, NumReduction> outStridesArray, + const std::array reduceDims, + const std::array alphas, + const std::array betas, + const void* in_dev, + const std::array out_dev_buffers, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStridesArray, + reduceDims, + alphas, + betas, + in_dev, + out_dev_buffers, + in_elementwise_op_tuple, + acc_elementwise_op_tuple); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceMultipleReduceThreadwise<" << BlockSize << ","; + str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << 1 << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << 
InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << ","; + str << "OutDstVectorSize"; + static_for<0, OutDstVectorSizeSeq::Size(), 1>{}([&](auto I) {str << "_" << OutDstVectorSizeSeq::At(I); }); + str << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp new file mode 100644 index 00000000..47d9df80 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +template +__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k, + const GridDesc_M_K gamma_grid_desc_m_k, + const GridDesc_M_K beta_grid_desc_m_k, + const GridDesc_M_K y_grid_desc_m_k, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x_global, + const GammaDataType* const __restrict__ p_gamma_global, + const BetaDataType* const __restrict__ p_beta_global, + YDataType* const __restrict__ p_y_global, + const AccElementwiseOperation acc_elementwise_op) +{ + GridwiseReduction::Run(x_grid_desc_m_k, + gamma_grid_desc_m_k, + beta_grid_desc_m_k, + y_grid_desc_m_k, + num_k_block_tile_iteration, + epsilon, + p_x_global, + p_gamma_global, + p_beta_global, + p_y_global, + acc_elementwise_op); +}; +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +// Y = Normalization(X, Beta, Gamma) +template +struct DeviceNormalizationImpl : public DeviceNormalization +{ + static_assert( + ((GammaSrcVectorDim == 0 && MThreadSliceSize % GammaSrcVectorSize == 0) || + (GammaSrcVectorDim == 1 && KThreadSliceSize % GammaSrcVectorSize == 0)), + "Invalid thread slice sizes and/or gamma vector sizes configuration, please check!"); + + static_assert( + ((BetaSrcVectorDim == 0 && MThreadSliceSize % BetaSrcVectorSize == 0) || + (BetaSrcVectorDim == 1 && KThreadSliceSize % BetaSrcVectorSize == 0)), + "Invalid thread slice sizes and/or beta vector sizes configuration, please check!"); + + using PassThrough = tensor_operation::element_wise::PassThrough; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + constexpr index_t NumInvariantDim = Rank - NumReduceDim; + static constexpr index_t numSrcDim = Rank; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); + const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + + const auto inDesc = 
make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, numSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = + make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto invariantDimLengths = + make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); + + using GridwiseReduceLayernormGeneric = + GridwiseNormalizationWelfordVariance_mk_to_mk; + using GridwiseNormalizationSweepOnce = + GridwiseNormalizationWelfordVariance_mk_to_mk; + + struct Argument : public BaseArgument + { + Argument(const std::vector lengths, + const std::vector xStrides, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const std::vector reduceDims, + AccElementwiseOperation acc_elementwise_op, + AccDataType epsilon, + const XDataType* p_x, + const GammaDataType* p_gamma, + const BetaDataType* p_beta, + YDataType* p_y) + : epsilon_(epsilon), + p_x_(p_x), + p_gamma_(p_gamma), + p_beta_(p_beta), + p_y_(p_y), + acc_elementwise_op_(acc_elementwise_op) + { + Lengths_ = shuffle_tensor_dimensions(lengths, reduceDims); + xStrides_ = shuffle_tensor_dimensions(xStrides, reduceDims); + yStrides_ = shuffle_tensor_dimensions(yStrides, reduceDims); + gammaStrides_ = shuffle_tensor_dimensions(gammaStrides, reduceDims); + betaStrides_ = shuffle_tensor_dimensions(betaStrides, reduceDims); + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(Lengths_); + + blkGroupSize_ = 1; + numBlockTileIteration_ = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + + gridSize_ = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize_; + + x_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, xStrides_, blkGroupSize_, 
numBlockTileIteration_); + gamma_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, gammaStrides_, blkGroupSize_, numBlockTileIteration_); + beta_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, betaStrides_, blkGroupSize_, numBlockTileIteration_); + y_grid_desc_m_k_ = + MakeSrc2dDescriptor(Lengths_, yStrides_, blkGroupSize_, numBlockTileIteration_); + + isSweeponce_ = + x_grid_desc_m_k_.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; + } + + AccDataType epsilon_; + + const XDataType* p_x_; + const GammaDataType* p_gamma_; + const BetaDataType* p_beta_; + YDataType* p_y_; + + std::vector Lengths_; + std::vector xStrides_; + std::vector gammaStrides_; + std::vector betaStrides_; + std::vector yStrides_; + + AccElementwiseOperation acc_elementwise_op_; + + int blkGroupSize_; + int numBlockTileIteration_; + size_t gridSize_; + + GridDesc_M_K x_grid_desc_m_k_; + GridDesc_M_K gamma_grid_desc_m_k_; + GridDesc_M_K beta_grid_desc_m_k_; + GridDesc_M_K y_grid_desc_m_k_; + bool isSweeponce_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto kernel_main = arg.isSweeponce_ + ? kernel_normalization + : kernel_normalization; + + float avg_time = 0; + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize_), + dim3(BlockSize), + 0, + arg.x_grid_desc_m_k_, + arg.gamma_grid_desc_m_k_, + arg.beta_grid_desc_m_k_, + arg.y_grid_desc_m_k_, + arg.numBlockTileIteration_, + arg.epsilon_, + arg.p_x_, + arg.p_gamma_, + arg.p_beta_, + arg.p_y_, + arg.acc_elementwise_op_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + + constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + if constexpr(XYSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return false; + } + else + { + if(p_arg_->xStrides_[NumInvariantDim - 1] != 1) + return false; + + if(p_arg_->invariant_lowest_length % XSrcVectorSize != 0) + return false; + }; + } + else + { + if(p_arg_->xStrides_[Rank - 1] != 1) + return false; + + if(p_arg_->Lengths_[Rank - 1] % XSrcVectorSize != 0) + return false; + }; + + if(p_arg_->Lengths_[Rank - 1] % YDstVectorSize != 0) + { + return false; + } + + // if fastest dim is not reduced + if constexpr(GammaSrcVectorDim == 0) + { + if(p_arg_->gammaStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->gammaStrides_[Rank - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % GammaSrcVectorSize != 0) + return (false); + } + + // if fastest dim is not reduced + if constexpr(BetaSrcVectorDim == 0) + { + if(p_arg_->betaStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(p_arg_->invariant_lowest_length % BetaSrcVectorSize != 0) + return (false); + } + else // if fastest dim is reduced + { + if(p_arg_->betaStrides_[Rank - 1] != 1) + return (false); + + if(p_arg_->Lengths_[Rank - 1] % BetaSrcVectorSize != 0) + return (false); + } + + return true; + }; + + std::unique_ptr + MakeArgumentPointer(const std::vector lengths, + const std::vector xStrides, + const std::vector gammaStrides, + const std::vector betaStrides, + const std::vector yStrides, + const 
std::vector reduceDims, + AccDataType epsilon, + const void* p_x, + const void* p_gamma, + const void* p_beta, + void* p_y, + void* p_saveMean, + void* p_saveInvVar, + AccElementwiseOperation acc_elementwise_op) override + { + // TODO + // Optional cache of the intermediate results (mean and InvVariance) during the + // forward pass could speedup in the backward + ignore = p_saveMean; + ignore = p_saveInvVar; + + return std::make_unique(lengths, + xStrides, + gammaStrides, + betaStrides, + yStrides, + reduceDims, + acc_elementwise_op, + epsilon, + static_cast(p_x), + static_cast(p_gamma), + static_cast(p_beta), + static_cast(p_y)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceNormalizationImpl<" << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "XYSrcVectorDim_" << XYSrcVectorDim << ","; + str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp new file mode 100644 index 00000000..7b96373c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_permute_impl.hpp @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/utility/math.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_permute.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_permute.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Swap last 2 dimensions +// input shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], d[NumDim-1]] +// ^^^^^^^^^^^ +// output shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-1], d[NumDim-2]] +// ^^^^^^^^^^^ +template +struct DevicePermuteImpl : DevicePermute +{ + using BaseType = DevicePermute; + using typename BaseType::Lengths; + using typename BaseType::Strides; + + static_assert(3 <= NumDim, "Only accept at least 3D dimension tensor"); + static_assert((NumDim - 2) <= SrcVectorDim && SrcVectorDim < NumDim); + static_assert((NumDim - 2) <= DstVectorDim && DstVectorDim < NumDim); + static_assert(SrcVectorDim != DstVectorDim); + + template + static auto ConvertArrayToTuple(const std::array& array) + { + static_assert(1 <= N && N <= NumDim); + + return generate_tuple([&](auto I) { return array[I]; }, Number{}); + } + + static auto MakeDescriptor_N_H_W(const Lengths& lengths, const Strides& stride) + { + // create nd descriptor, shape: [d[0], d[1], d[2], ..., d[NumDim-3], d[NumDim-2], + // d[NumDim-1]] + const auto desc = + make_naive_tensor_descriptor(ConvertArrayToTuple(lengths), ConvertArrayToTuple(stride)); + + // merge nd to 3d descriptor, shape: [(d[0] * d[1] * d[2] * ... 
* d[NumDim-3]), d[NumDim-2], + // d[NumDim-1]] + // => [N, H, W] + const index_t H = *std::next(rbegin(lengths)); + const index_t W = *rbegin(lengths); + const auto desc_n_h_w = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(ConvertArrayToTuple(lengths)), + make_pass_through_transform(H), + make_pass_through_transform(W)), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{}), + Sequence{}, + Sequence{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return PadTensorDescriptor( + desc_n_h_w, make_tuple(NPerBlock, HPerBlock, WPerBlock), Sequence{}); + } + + using InGridDesc = decltype(MakeDescriptor_N_H_W({1, 1}, {1, 1})); + using OutGridDesc = InGridDesc; + + using GridwisePermute = GridwisePermute< + InGridDesc, + OutGridDesc, + InDataType, + OutDataType, + ElementwiseOperation, + BlockSize, + NPerBlock, + HPerBlock, + WPerBlock, + InBlockLdsExtraW, + InBlockTransferThreadClusterLengths, + InBlockTransferThreadClusterArrangeOrder, + SrcVectorDim - (NumDim - 3), // calculate new SrcVectorDim for the merged descriptor + DstVectorDim - (NumDim - 3), // calculate new DstVectorDim for the merged descriptor + SrcScalarPerVector, + DstScalarPerVector>; + + using Block2TileMap = typename GridwisePermute::DefaultBlock2TileMap; + + struct Argument : public BaseArgument + { + Argument(const Lengths& in_lengths, + const Strides& in_strides, + const Lengths& out_lengths, + const Strides& out_strides, + const void* in_dev_buffer, + void* out_dev_buffer, + ElementwiseOperation elementwise_op) + : in_dev_buffer_(static_cast(in_dev_buffer)), + out_dev_buffer_(static_cast(out_dev_buffer)), + in_grid_desc_(MakeDescriptor_N_H_W(in_lengths, in_strides)), + out_grid_desc_(MakeDescriptor_N_H_W(out_lengths, out_strides)), + in_lengths_(in_lengths), + in_strides_(in_strides), + out_lengths_(out_lengths), + out_strides_(out_strides), + elementwise_op_(elementwise_op), + block_2_tile_map_(GridwisePermute::MakeDefaultBlock2TileMap(in_grid_desc_)) + { + } + + const InDataType* in_dev_buffer_; + OutDataType* out_dev_buffer_; + InGridDesc in_grid_desc_; + OutGridDesc out_grid_desc_; + + Lengths in_lengths_; + Strides in_strides_; + Lengths out_lengths_; + Strides out_strides_; + + ElementwiseOperation elementwise_op_; + + Block2TileMap block_2_tile_map_; + }; + + struct Invoker : BaseInvoker + { + static float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const index_t grid_size = arg.block_2_tile_map_.CalculateGridSize(arg.in_grid_desc_); + + const auto kernel = kernel_nd_permute; + + float elapsed_time = launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.in_grid_desc_, + arg.out_grid_desc_, + arg.in_dev_buffer_, + arg.out_dev_buffer_, + arg.elementwise_op_, + arg.block_2_tile_map_); + return elapsed_time; + } + + float Run(const BaseArgument* arg, + const StreamConfig& stream_config = StreamConfig{}) override final + { + const auto* const argument = dynamic_cast(arg); + if(!argument) + { + return NAN; + } + + return Run(*argument, stream_config); + } + }; + + static bool IsSupportedArgument(const Argument& arg) + { + constexpr auto GetPaddedLength = [](index_t length, index_t tile_length) { + return math::integer_divide_ceil(length, tile_length) * tile_length; + }; + + constexpr auto IsScalarPerVectorValid = + [](index_t length, index_t stride, index_t scalar_per_vector) { + if(stride == 1 && length % scalar_per_vector == 0) + { + return true; + } + else if(stride != 1 && 
scalar_per_vector == 1) + { + return true; + } + + return false; + }; + + return IsScalarPerVectorValid(arg.in_lengths_[SrcVectorDim], + arg.in_strides_[SrcVectorDim], + SrcScalarPerVector) && + IsScalarPerVectorValid( + GetPaddedLength(arg.in_lengths_[SrcVectorDim], + (SrcVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)), + arg.in_strides_[SrcVectorDim], + SrcScalarPerVector) && + IsScalarPerVectorValid(arg.out_lengths_[DstVectorDim], + arg.out_strides_[DstVectorDim], + DstScalarPerVector) && + IsScalarPerVectorValid( + GetPaddedLength(arg.out_lengths_[DstVectorDim], + (DstVectorDim == NumDim - 2 ? HPerBlock : WPerBlock)), + arg.in_strides_[DstVectorDim], + DstScalarPerVector) && + GridwisePermute::CheckValidity(arg.in_grid_desc_, arg.out_grid_desc_); + }; + + // override methods inherited from 'BaseOperator' + bool IsSupportedArgument(const BaseArgument* arg) override final + { + const auto* const argument = dynamic_cast(arg); + if(!argument) + { + return false; + } + + return IsSupportedArgument(*argument); + } + + // override methods inherited from 'DevicePermute' + std::unique_ptr + MakeArgumentPointer(const Lengths& in_lengths, + const Strides& in_strides, + const Lengths& out_lengths, + const Strides& out_strides, + const void* in_dev_buffer, + void* out_dev_buffer, + ElementwiseOperation elementwise_op) override final + { + return std::make_unique(in_lengths, + in_strides, + out_lengths, + out_strides, + in_dev_buffer, + out_dev_buffer, + elementwise_op); + } + + std::unique_ptr MakeInvokerPointer() override final + { + return std::make_unique(); + }; + + // other constructor methods + template + static std::enable_if_t, Argument> + MakeArgument(Args&&... args) noexcept(std::is_nothrow_constructible_v) + { + return Argument{std::forward(args)...}; + } + + static std::enable_if_t, Invoker> + MakeInvoker() noexcept(std::is_nothrow_default_constructible_v) + { + return Invoker{}; + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp new file mode 100644 index 00000000..bfde40cd --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_pool2d_fwd_nhwc_nhwc.hpp @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
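// Illustrative-only sketch (assumed sizes, not taken from this patch): the permute device op
// above swaps the last two dimensions, so for a packed row-major [N, H, W] input the output is
// a packed row-major [N, W, H] tensor. The host-side arithmetic below only shows that
// length/stride relationship; it does not instantiate the device op itself.
#include <array>
#include <cstdio>

int main()
{
    // Assumed example input: shape [N, H, W] = [8, 16, 32], packed row-major.
    const std::array<int, 3> in_lengths{8, 16, 32};
    const std::array<int, 3> in_strides{16 * 32, 32, 1};

    // Output swaps the last two dimensions: shape [N, W, H], again packed row-major.
    const std::array<int, 3> out_lengths{in_lengths[0], in_lengths[2], in_lengths[1]};
    const std::array<int, 3> out_strides{out_lengths[1] * out_lengths[2], out_lengths[2], 1};

    std::printf("in  [%d, %d, %d] / strides [%d, %d, %d]\n",
                in_lengths[0], in_lengths[1], in_lengths[2],
                in_strides[0], in_strides[1], in_strides[2]);
    std::printf("out [%d, %d, %d] / strides [%d, %d, %d]\n",
                out_lengths[0], out_lengths[1], out_lengths[2],
                out_strides[0], out_strides[1], out_strides[2]);
    return 0;
}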
+ +#pragma once + +#include +#include + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/device_pool2d_fwd.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C : public DevicePool2dFwd +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + using IndexDataType = int32_t; + + using ReduceOperation = typename reduce_binary_operator::opType; + + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + + static constexpr index_t InSrcOutDstVectorDim = + 0; // for NHWC, the dim C is the vector Dim for both input and output in memory, which is + // not reduced. + + static constexpr ck::index_t ReduceM_BlockTileSize = + ReduceMThreadClusterSize * ReduceMThreadSliceSize; + static constexpr ck::index_t ReduceK_BlockTileSize = + ReduceKThreadClusterSize * ReduceKThreadSliceSize; + + static auto MakeABGridDescriptor_A_M_K_B_M(ck::index_t N, + ck::index_t C, + std::array input_spatial_lengths, + std::array window_spatial_lengths, + std::array output_spatial_lengths, + std::array window_strides, + std::array input_left_pads, + std::array input_right_pads) + { + const index_t Hi = input_spatial_lengths[0]; + const index_t Wi = input_spatial_lengths[1]; + + const index_t Ho = output_spatial_lengths[0]; + const index_t Wo = output_spatial_lengths[1]; + + const index_t Y = window_spatial_lengths[0]; + const index_t X = window_spatial_lengths[1]; + + const index_t ConvStrideH = window_strides[0]; + const index_t ConvStrideW = window_strides[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t ReduceMRaw = N * Ho * Wo * C; + const index_t ReduceMPad = + math::integer_least_multiple(ReduceMRaw, ReduceM_BlockTileSize) - ReduceMRaw; + + const index_t ReduceKRaw = Y * X; + const index_t ReduceKPad = + math::integer_least_multiple(ReduceKRaw, ReduceK_BlockTileSize) - ReduceKRaw; + + // A[ReduceM, ReduceK] + const auto in_grid_desc_n_hi_wi_c = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_grid_desc_n_hip_wip_c = transform_tensor_descriptor( + in_grid_desc_n_hi_wi_c, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_grid_desc_n_y_ho_x_wo_c = transform_tensor_descriptor( + in_grid_desc_n_hip_wip_c, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), 
make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_grid_desc_reducemraw_reducekraw = + transform_tensor_descriptor(in_grid_desc_n_y_ho_x_wo_c, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo, C)), + make_merge_transform(make_tuple(Y, X))), + make_tuple(Sequence<0, 2, 4, 5>{}, Sequence<1, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_grid_desc_reducem_reducek = transform_tensor_descriptor( + in_grid_desc_reducemraw_reducekraw, + make_tuple(make_right_pad_transform(ReduceMRaw, ReduceMPad), + make_right_pad_transform(ReduceKRaw, ReduceKPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + // B[ReduceM] + const auto out_grid_desc_reducemraw = + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo * C)); + + const auto out_grid_desc_reducem = transform_tensor_descriptor( + out_grid_desc_reducemraw, + make_tuple(make_right_pad_transform(ReduceMRaw, ReduceMPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + + return make_tuple(in_grid_desc_reducem_reducek, out_grid_desc_reducem); + } + + using ABGridDescs = decltype( + MakeABGridDescriptor_A_M_K_B_M(1, 1, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1}, {1, 1})); + + using AGridDesc_M_K = remove_cvref_t; + using BGridDesc_M = remove_cvref_t; + + // TODO + struct Argument : public BaseArgument + { + Argument(const InDataType* p_in_dev, + OutDataType* p_out_dev, + int* p_out_indices_dev, + ck::index_t N, + ck::index_t C, + std::array& input_spatial_lengths, + std::array& window_spatial_lengths, + std::array& output_spatial_lengths, + std::array& window_strides, + std::array& input_left_pads, + std::array& input_right_pads) + : p_in_dev_{p_in_dev}, + p_out_dev_{p_out_dev}, + p_out_indices_dev_{p_out_indices_dev}, + a_grid_desc_m_k_{}, + b_grid_desc_m_{} + { + const auto descs = MakeABGridDescriptor_A_M_K_B_M(N, + C, + input_spatial_lengths, + window_spatial_lengths, + output_spatial_lengths, + window_strides, + input_left_pads, + input_right_pads); + + a_grid_desc_m_k_ = descs[I0]; + b_grid_desc_m_ = descs[I1]; + + invariant_lowest_length_ = C; + reduce_lowest_length_ = window_spatial_lengths[1]; + + int32_t reduceLength = window_spatial_lengths[0] * window_spatial_lengths[1]; + + std::tie(in_element_op_, acc_element_op_) = + reduce_unary_operator::GetElementwiseOperator(reduceLength); + } + + const InDataType* p_in_dev_; + OutDataType* p_out_dev_; + int* p_out_indices_dev_; + AGridDesc_M_K a_grid_desc_m_k_; + BGridDesc_M b_grid_desc_m_; + InElementwiseOperation in_element_op_; + AccElementwiseOperation acc_element_op_; + + // for checking vector load/store + ck::index_t invariant_lowest_length_; + ck::index_t reduce_lowest_length_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + using gridwise_reduce = + GridwiseReduction_mk_to_m_threadwise; + + const auto kernel = kernel_reduce_threadwise; + + ck::index_t ReduceM = arg.a_grid_desc_m_k_.GetLength(I0); + + const index_t grid_size = (ReduceM / ReduceM_BlockTileSize); + + return launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.a_grid_desc_m_k_, + arg.b_grid_desc_m_, + arg.in_element_op_, + arg.acc_element_op_, + float(1), + arg.p_in_dev_, + nullptr, + float(0), + arg.p_out_dev_, + 
arg.p_out_indices_dev_); + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if(pArg->invariant_lowest_length_ % InSrcOutDstVectorSize != 0) + { + return (false); + } + + return (true); + } + + std::unique_ptr + MakeArgumentPointer(const void* p_in_dev, + void* p_out_dev, + void* p_out_indices_dev, + ck::index_t N, + ck::index_t C, + std::array input_spatial_lengths, + std::array window_spatial_lengths, + std::array output_spatial_lengths, + std::array window_strides, + std::array input_left_pads, + std::array input_right_pads) override + { + return std::make_unique(static_cast(p_in_dev), + static_cast(p_out_dev), + static_cast(p_out_indices_dev), + N, + C, + input_spatial_lengths, + window_spatial_lengths, + output_spatial_lengths, + window_strides, + input_left_pads, + input_right_pads); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DevicePool2dFwd_Input_N_Hi_Wi_C_Output_N_Ho_Wo_C<" << BlockSize << ","; + str << "M_C" << ReduceMThreadClusterSize << "_S" << ReduceMThreadSliceSize << ","; + str << "K_C" << ReduceKThreadClusterSize << "_S" << ReduceKThreadSliceSize << ","; + str <<"InSrcOutDstVectorSize_" << InSrcOutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp new file mode 100644 index 00000000..5dc051be --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
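// Illustrative-only sketch (assumed sizes, not taken from this patch): the NHWC pooling op above
// views the problem as a 2-D reduction A[ReduceM, ReduceK] with ReduceM = N * Ho * Wo * C and
// ReduceK = Y * X, then right-pads both extents up to the block tile sizes. The tile sizes below
// are assumed stand-ins for ReduceM_BlockTileSize / ReduceK_BlockTileSize.
#include <cstdio>

int main()
{
    const int N = 2, C = 64, Ho = 14, Wo = 14; // assumed output tensor sizes
    const int Y = 3, X = 3;                    // assumed pooling window

    const int ReduceM = N * Ho * Wo * C; // one reduction per output element
    const int ReduceK = Y * X;           // elements reduced per pooling window

    const int ReduceM_BlockTileSize = 256, ReduceK_BlockTileSize = 8; // assumed

    auto least_multiple = [](int v, int m) { return ((v + m - 1) / m) * m; };

    std::printf("ReduceM = %d (padded to %d), ReduceK = %d (padded to %d)\n",
                ReduceM, least_multiple(ReduceM, ReduceM_BlockTileSize),
                ReduceK, least_multiple(ReduceK, ReduceK_BlockTileSize));
    return 0;
}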
+ +#pragma once + +#include +#include + +#include "ck/utility/common_header.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/reduction_operator.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// here, inLengths[] is already shuffled so that lengths of invariant dims are included before those +// of reduce dims +template +std::pair get_2d_lengths(const std::vector& inLengths) +{ + static_assert(Rank <= 6, "bigger Rank size not supported!"); + + long_index_t invariant_total_length = 1; + long_index_t reduce_total_length = 1; + + constexpr int NumInvariantDim = Rank - NumReduceDim; + + for(int i = NumInvariantDim; i < Rank; i++) + reduce_total_length *= inLengths[i]; + + for(int i = 0; i < NumInvariantDim; i++) + invariant_total_length *= inLengths[i]; + + return std::make_pair(invariant_total_length, reduce_total_length); +}; + +template +std::pair get_2d_lengths(const std::array& inLengths) +{ + static_assert(Rank <= 6, "bigger Rank size not supported!"); + + long_index_t invariant_total_length = 1; + long_index_t reduce_total_length = 1; + + constexpr int NumInvariantDim = Rank - NumReduceDim; + + for(int i = NumInvariantDim; i < Rank; i++) + reduce_total_length *= inLengths[i]; + + for(int i = 0; i < NumInvariantDim; i++) + invariant_total_length *= inLengths[i]; + + return std::make_pair(invariant_total_length, reduce_total_length); +}; + +// helper functions using variadic template arguments +template +auto make_tuple_from_array_and_index_seq(const std::vector& lengths, Sequence) +{ + return make_tuple(static_cast(lengths[Ns])...); +}; + +template +auto make_tuple_from_array(const std::vector& lengths, Number) +{ + static_assert(arraySize >= 1 && arraySize <= 6, "The tensor should have 1 to 6 dimensions"); + + constexpr auto index_seq = typename arithmetic_sequence_gen<0, arraySize, 1>::type{}; + + return make_tuple_from_array_and_index_seq(lengths, index_seq); +}; + +template +std::vector shuffle_tensor_dimensions(const std::vector& origLengthsStrides, + const std::vector& reduceDims) +{ + std::vector newLengthsStrides; + + assert(Rank == origLengthsStrides.size() && NumReduceDim == reduceDims.size()); + + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + // collect invariant dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + newLengthsStrides.push_back(origLengthsStrides[i]); + }; + + // collect reduce dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) > 0) + { + newLengthsStrides.push_back(origLengthsStrides[i]); + }; + + return newLengthsStrides; +}; + +template +std::array +shuffle_tensor_dimensions(const std::array& origLengthsStrides, + const std::array& reduceDims) +{ + std::array newLengthsStrides; + + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + // collect invariant dimensions + int pos = 0; + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + newLengthsStrides[pos++] = origLengthsStrides[i]; + }; + + // collect reduce dimensions + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) > 0) + { + newLengthsStrides[pos++] = origLengthsStrides[i]; + }; + + return newLengthsStrides; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp 
b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp new file mode 100644 index 00000000..93855eb3 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -0,0 +1,543 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceMultiBlock + : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, + "Invalid thread cluster size assignments!"); + + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + using IndexDataType = int32_t; + + static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex; + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumSrcDim = Rank; + static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 
1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + // So far, only AtomicAdd is considered, other Atomic Operation like AtomicMax can be added + // later + static constexpr bool use_multiblock = + (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd); + + static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType::value, + "The OutDataType must support the specified OutMemoryDataOperation!"); + + static_assert(!use_multiblock || (use_multiblock && !OutputIndex), + "MultiBlock reduction can only be used when outputing index is not required"); + + static_assert( + ReduceOperation::IsCompatibleInMemoryDataOperation(OutMemoryDataOperation), + "The reduction accumulation operation must be compatible with the OutMemoryDataOperation!"); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return 
outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto out_grid_desc_m_padded = transform_tensor_descriptor( + out_grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + static auto MakeDst1dDescriptorForBufferSet(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto length = out_grid_desc_m.GetLength(Number<0>{}); + + const auto pad = math::integer_least_multiple(length, BlockSize) - length; + + auto out_grid_desc_m_padded = + transform_tensor_descriptor(out_grid_desc_m, + make_tuple(make_right_pad_transform(length, pad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + struct Argument : public BaseArgument + { + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const InDataType* in_dev, + const IndexDataType* in_index_dev, + OutDataType* out_dev, + IndexDataType* out_index_dev, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) + : outLengths_{outLengths}, + outStrides_{outStrides}, + in_dev_{in_dev}, + in_index_dev_{in_index_dev}, + out_dev_{out_dev}, + out_index_dev_{out_index_dev}, + in_elementwise_op_{in_elementwise_op}, + acc_elementwise_op_{acc_elementwise_op} + { + if(Rank != inLengths.size() || Rank != inStrides.size() || + NumReduceDim != reduceDims.size()) + { + throw std::runtime_error( + "One of inLengths/inStrides/reduceDims has invalid size!" + "\nExpected size inLengths: " + + std::to_string(Rank) + ", inStrides: " + std::to_string(Rank) + + ", reduceDims: " + std::to_string(NumReduceDim) + + "\nBut have inLengths: " + std::to_string(inLengths.size()) + + ", inStrides: " + std::to_string(inStrides.size()) + + ", reduceDims: " + std::to_string(reduceDims.size())); + } + + for(std::size_t i = 0; i < reduceDims.size(); ++i) + { + if(reduceDims[i] < 0 || reduceDims[i] >= Rank) + { + throw std::runtime_error("Provided reduce dimension exceed input tensor Rank!" 
+ "\nHave reduceDims[" + + std::to_string(i) + + "]: " + std::to_string(reduceDims[i])); + } + } + + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + alpha_ = type_convert(alpha); + beta_ = type_convert(beta); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths_); + + if constexpr(NumInvariantDim == 0) + invariant_lowest_length = 1; + else + invariant_lowest_length = inLengths_[NumInvariantDim - 1]; + + reduce_lowest_length = inLengths_[Rank - 1]; + + if constexpr(use_multiblock) + { + + int iterations = 1; + while(true) + { + int testBlkGroupSize = + (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + // we want the blkGroupSize be not more than 128 + if(testBlkGroupSize <= 128) + break; + + iterations++; + }; + + blkGroupSize = (reduce_total_length + (K_BlockTileSize * iterations) - 1) / + (K_BlockTileSize * iterations); + + numBlockTileIteration = iterations; + } + else + { + blkGroupSize = 1; + numBlockTileIteration = + (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize; + + gridSize_pre = + math::integer_least_multiple(invariant_total_length, BlockSize) / BlockSize; + } + + std::array inLengths_; + std::array inStrides_; + std::array outLengths_; + std::array outStrides_; + + AccDataType alpha_; + AccDataType beta_; + + const InDataType* in_dev_; + const IndexDataType* in_index_dev_; + OutDataType* out_dev_; + IndexDataType* out_index_dev_; + + InElementwiseOperation in_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; + + index_t invariant_lowest_length; + index_t reduce_lowest_length; + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + int blkGroupSize; + int numBlockTileIteration; + size_t gridSize; + + size_t gridSize_pre; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto in_grid_desc_m_k = DeviceReduceMultiBlock::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + const auto out_grid_desc_m = + DeviceReduceMultiBlock::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); + const auto out_grid_desc_m_2 = DeviceReduceMultiBlock::MakeDst1dDescriptorForBufferSet( + arg.outLengths_, arg.outStrides_); + + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + using OutGridDesc_M_2 = decltype(out_grid_desc_m_2); + + using GridwiseReduce = GridwiseReduction_mk_to_m_multiblock; + + const auto kernel_main = kernel_reduce_multiblock; + + float avg_time = 0; + + if constexpr(use_multiblock) + { + const auto identityVal = + ck::reduce::GetIdentityValueForInMemoryDataOperation( + OutMemoryDataOperation); + + const auto kernel_pre = + kernel_buffer_set_value; + + avg_time += launch_and_time_kernel(stream_config, + kernel_pre, + dim3(arg.gridSize_pre), + dim3(BlockSize), + 0, + out_grid_desc_m_2, + arg.out_dev_, + identityVal); + }; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.blkGroupSize, + arg.numBlockTileIteration, + arg.alpha_, + arg.in_dev_, + arg.in_index_dev_, + arg.beta_, + arg.out_dev_, 
+ arg.out_index_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + static bool IsSupportedArgument(const Argument* pArg) + { + if constexpr(use_multiblock) + { + if(static_cast(pArg->beta_) != 0.0f) + return (false); + }; + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return (false); + } + else + { + if(pArg->inStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(pArg->invariant_lowest_length % InSrcVectorSize != 0) + return (false); + }; + } + else + { + if(pArg->inStrides_[Rank - 1] != 1) + return (false); + + if(pArg->reduce_lowest_length % InSrcVectorSize != 0) + return (false); + }; + + // To improve + if(pArg->invariant_lowest_length % OutDstVectorSize != 0) + return (false); + + if constexpr(use_multiblock) + { + // blkGroupSize of 1 should be handled by Blockwise path using + // InMemoryDataOperationEnum::Set + if(pArg->blkGroupSize == 1) + return (false); + + // This is very strong restriction, but needed to avoid some failure + if(pArg->invariant_lowest_length % M_BlockTileSize != 0) + return (false); + } + else + { + // cases with very small reduce_total_length should be handled by ThreadWise kernel + // if(pArg->reduce_total_length / KThreadSliceSize < 2) + // return (false); + }; + + return (true); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(dynamic_cast(p_arg)); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const void* in_dev, + const void* in_index_dev, + void* out_dev, + void* out_index_dev, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + reduceDims, + alpha, + beta, + static_cast(in_dev), + static_cast(in_index_dev), + static_cast(out_dev), + static_cast(out_index_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << (OutMemoryDataOperation == InMemoryDataOperationEnum::Set? "DeviceReduceBlockWise<" : "DeviceReduceMultiBlock<") << BlockSize << ","; + str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp new file mode 100644 index 00000000..05e14f08 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
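// Illustrative-only sketch (assumed sizes, not taken from this patch): the multiblock reduction
// above grows `iterations` until blkGroupSize -- the number of blocks cooperating on one output
// row via AtomicAdd -- is at most 128, mirroring the loop in the Argument constructor.
#include <cstdio>

int main()
{
    const long long reduce_total_length = 1 << 20; // assumed
    const long long K_BlockTileSize     = 256;     // assumed KThreadClusterSize * KThreadSliceSize

    int iterations = 1;
    while(true)
    {
        const long long testBlkGroupSize =
            (reduce_total_length + K_BlockTileSize * iterations - 1) /
            (K_BlockTileSize * iterations);

        if(testBlkGroupSize <= 128)
            break;

        ++iterations;
    }

    const long long blkGroupSize = (reduce_total_length + K_BlockTileSize * iterations - 1) /
                                   (K_BlockTileSize * iterations);

    std::printf("iterations = %d, blkGroupSize = %lld\n", iterations, blkGroupSize);
    return 0;
}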
+ +#pragma once + +#include +#include +#include + +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceReduceThreadWise + : public DeviceReduce +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + using IndexDataType = int32_t; + + static constexpr bool HaveIndexInput = OutputIndex && HaveIndexInputIfOutputIndex; + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumSrcDim = Rank; + static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + static constexpr index_t M_BlockTileSize = BlockSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = 1 * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = + math::integer_least_multiple(reduceLength, K_BlockTileSize) - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + 
make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) + { + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); + + auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); + + auto out_grid_desc_m = transform_tensor_descriptor( + outDesc, + make_tuple(make_merge_transform(tupleDstLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); + + const auto outPad = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + + auto out_grid_desc_m_padded = transform_tensor_descriptor( + out_grid_desc_m, + make_tuple(make_right_pad_transform(invariantLength, outPad)), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0>{})); + return (out_grid_desc_m_padded); + }; + + struct Argument : public BaseArgument + { + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const InDataType* in_dev, + OutDataType* out_dev, + IndexDataType* out_index_dev, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) + : outLengths_{outLengths}, + outStrides_{outStrides}, + in_dev_{in_dev}, + out_dev_{out_dev}, + out_index_dev_{out_index_dev}, + in_elementwise_op_{in_elementwise_op}, + acc_elementwise_op_{acc_elementwise_op} + { + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + alpha_ = type_convert(alpha); + beta_ = type_convert(beta); + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths_); + + if constexpr(NumInvariantDim == 0) + invariant_lowest_length = 1; + else + invariant_lowest_length = inLengths_[NumInvariantDim - 1]; + + reduce_lowest_length = inLengths_[Rank - 1]; + + numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize; + } + + std::array inLengths_; + std::array inStrides_; + std::array outLengths_; + std::array outStrides_; + + AccDataType alpha_; + AccDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + IndexDataType* out_index_dev_; + + InElementwiseOperation in_elementwise_op_; + AccElementwiseOperation acc_elementwise_op_; + + index_t invariant_lowest_length; + index_t reduce_lowest_length; + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + int numBlockTileIteration; + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto in_grid_desc_m_k = + DeviceReduceThreadWise::MakeSrc2dDescriptor(arg.inLengths_, arg.inStrides_); + const auto out_grid_desc_m = + DeviceReduceThreadWise::MakeDst1dDescriptor(arg.outLengths_, arg.outStrides_); + using InGridDesc_M_K = decltype(in_grid_desc_m_k); + using OutGridDesc_M = decltype(out_grid_desc_m); + + float avg_time = 0; + + using GridwiseReduce = + GridwiseReduction_mk_to_m_threadwise; 
+ + const auto kernel = kernel_reduce_threadwise; + + avg_time = launch_and_time_kernel(stream_config, + kernel, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m, + arg.in_elementwise_op_, + arg.acc_elementwise_op_, + arg.alpha_, + arg.in_dev_, + nullptr, + arg.beta_, + arg.out_dev_, + arg.out_index_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + const Argument* pArg = dynamic_cast(p_arg); + + if constexpr(InSrcVectorDim == 0) + { + if constexpr(NumInvariantDim == 0) + { + return (false); + } + else + { + if(pArg->inStrides_[NumInvariantDim - 1] != 1) + return (false); + + if(pArg->invariant_lowest_length % InSrcVectorSize != 0) + return (false); + }; + } + else + { + if(pArg->inStrides_[Rank - 1] != 1) + return (false); + + if(pArg->reduce_lowest_length % InSrcVectorSize != 0) + return (false); + }; + + // To improve + if(pArg->invariant_lowest_length % OutDstVectorSize != 0) + return (false); + + // cases with big reduce_total_length should be handled by Blockwise kernel + if(pArg->reduce_total_length / KThreadSliceSize >= 32) + return (false); + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const void* in_dev, + const void* in_index_dev, + void* out_dev, + void* out_index_dev, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) override + { + (void)in_index_dev; + + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + reduceDims, + alpha, + beta, + static_cast(in_dev), + static_cast(out_dev), + static_cast(out_index_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceThreadWise<" << BlockSize << ","; + str << "M_C" << BlockSize << "_S" << MThreadSliceSize << ","; + str << "K_C" << 1 << "_S" << KThreadSliceSize << ","; + str << "InSrcVectorDim_" << InSrcVectorDim << "_InSrcVectorSize_" << InSrcVectorSize << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp new file mode 100644 index 00000000..8630a2c6 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp" +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceSoftmaxImpl : public DeviceSoftmax +{ + static constexpr index_t kRank = Rank; + static constexpr index_t kNumReduceDim = NumReduceDim; + static constexpr index_t kNumInvariantDim = Rank - NumReduceDim; + + virtual index_t GetRank() const override { return kRank; } + + virtual index_t GetNumReduceDim() const override { return kNumReduceDim; } + + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumSrcDim = Rank; + static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, 
Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); + + using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk; + + using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk; + + struct Argument : public BaseArgument + { + Argument(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + AccDataType alpha, + AccDataType beta, + const InDataType* in_dev, + OutDataType* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) + : alpha_{alpha}, + beta_{beta}, + in_dev_{in_dev}, + out_dev_{out_dev}, + in_elementwise_op_{in_elementwise_op}, + acc_elementwise_op_{acc_elementwise_op} + { + if(Rank != inLengths.size() || Rank != inStrides.size() || + NumReduceDim != reduceDims.size()) + { + throw std::runtime_error( + "One of inLengths/inStrides/reduceDims has invalid size!" + "\nExpected size inLengths: " + + std::to_string(Rank) + ", inStrides: " + std::to_string(Rank) + + ", reduceDims: " + std::to_string(NumReduceDim) + + "\nBut have inLengths: " + std::to_string(inLengths.size()) + + ", inStrides: " + std::to_string(inStrides.size()) + + ", reduceDims: " + std::to_string(reduceDims.size())); + } + + for(std::size_t i = 0; i < reduceDims.size(); ++i) + { + if(reduceDims[i] < 0 || reduceDims[i] >= Rank) + { + throw std::runtime_error("Provided reduce dimension exceed input tensor Rank!" + "\nHave reduceDims[" + + std::to_string(i) + + "]: " + std::to_string(reduceDims[i])); + } + } + + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); + inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); + + long_index_t invariant_total_length; + long_index_t reduce_total_length; + + std::tie(invariant_total_length, reduce_total_length) = + get_2d_lengths(inLengths_); + + if constexpr(NumInvariantDim == 0) + invariant_lowest_length_ = 1; + else + invariant_lowest_length_ = inLengths_[NumInvariantDim - 1]; + + blkGroupSize = 1; + numBlockTileIteration = (reduce_total_length + K_BlockTileSize - 1) / K_BlockTileSize; + + gridSize = math::integer_least_multiple(invariant_total_length, M_BlockTileSize) / + M_BlockTileSize * blkGroupSize; + } + + std::vector inLengths_; + std::vector inStrides_; + + AccDataType alpha_; + AccDataType beta_; + + const InDataType* in_dev_; + OutDataType* out_dev_; + + InElementwiseOp in_elementwise_op_; + AccElementwiseOp acc_elementwise_op_; + + index_t invariant_lowest_length_; + + int blkGroupSize; + int numBlockTileIteration; + size_t gridSize; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + const auto in_grid_desc_m_k = DeviceSoftmaxImpl::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + const auto out_grid_desc_m_k = DeviceSoftmaxImpl::MakeSrc2dDescriptor( + arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration); + + bool sweep_once = + in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize; + + const auto kernel_main = sweep_once ? 
kernel_softmax + : kernel_softmax; + + float avg_time = 0; + + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.gridSize), + dim3(BlockSize), + 0, + in_grid_desc_m_k, + out_grid_desc_m_k, + arg.blkGroupSize, + arg.numBlockTileIteration, + arg.alpha_, + arg.in_dev_, + arg.beta_, + arg.out_dev_); + + return (avg_time); + }; + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + static bool IsSupportedArgument(const Argument& arg) + { + if constexpr(InSrcVectorDim == 0) + { + if constexpr(kNumInvariantDim == 0) + { + return false; + } + else + { + if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1) + { + return false; + } + if(arg.invariant_lowest_length_ % InSrcVectorSize != 0) + { + return false; + } + } + } + else + { + if(arg.inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1) + { + return false; + } + if(arg.inLengths_[Rank - 1] % InSrcVectorSize != 0) + { + return false; + } + } + + // To improve + if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0) + { + return false; + } + + if(arg.inLengths_[Rank - 1] % OutDstVectorSize != 0) + { + return false; + } + + return true; + }; + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + const AccDataType alpha, + const AccDataType beta, + const InDataType* in_dev, + OutDataType* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) + { + return Argument{inLengths, + inStrides, + reduceDims, + alpha, + beta, + in_dev, + out_dev, + in_elementwise_op, + acc_elementwise_op}; + }; + + // + // @brief Makes a pointer to Argument class. + // + // @param[in] inLengths Input tensor extent(s) from high to low dimension + // @param[in] inStrides Input tensor stride(s) from high to low dimension + // @param[in] reduceDims The dimension(s) the normalization operation is applied + // @param[in] alpha Typeless pointer in host memory storing the alpha scaling + // value as type AccDataType + // @param[in] beta Typeless pointer in host memory storing the beta scaling + // value as type AccDataType + // @param[in] in_dev Typeless const pointer in device memory storing the input + // tensor + // @param out_dev Typeless pointer in device memory storing the output tensor + // @param[in] in_elementwise_op The input elementwise operation. + // @param[in] acc_elementwise_op The accumulation elementwise operation. + // + // @return Unique pointer to the Argument class. 
+ // + std::unique_ptr MakeArgumentPointer(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + const void* alpha, + const void* beta, + const void* in_dev, + void* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) override + { + return std::make_unique(inLengths, + inStrides, + reduceDims, + *static_cast(alpha), + *static_cast(beta), + static_cast(in_dev), + static_cast(out_dev), + in_elementwise_op, + acc_elementwise_op); + }; + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceReduceSoftmax<" + << Rank << "," << NumReduceDim << "," << BlockSize << "," + << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << "," + << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << "," + << "InSrcVectorDim_" << InSrcVectorDim + << "_InSrcVectorSize_" << InSrcVectorSize + << "_OutDstVectorSize_" << OutDstVectorSize << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp new file mode 100644 index 00000000..1f2b46ed --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_sparse_embedding3_forward_layernorm.hpp @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/host_utility/device_prop.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct DeviceSparseEmbedding3ForwardLayernorm : public BaseOperator +{ + + static auto MakeOutputDescriptor(const index_t index_length, const index_t rows) + { + return make_naive_tensor_descriptor_packed(make_tuple(index_length, rows)); + } + + struct Argument : public BaseArgument + { + Argument(OutType* p_out, + const EmbType* p_emb_a, + const EmbType* p_emb_b, + const EmbType* p_emb_c, + const IndexType* p_index_a, + const IndexType* p_index_b, + const IndexType* p_index_c, + const GammaDataType* p_gamma, + const BetaDataType* p_beta, + const ck::index_t NumRows, + const ck::index_t EmbeddingDim, + const ck::index_t IndexLength, + const AccDataType epsilon) + : p_out_(p_out), + p_emb_a_(p_emb_a), + p_emb_b_(p_emb_b), + p_emb_c_(p_emb_c), + p_index_a_(p_index_a), + p_index_b_(p_index_b), + p_index_c_(p_index_c), + p_gamma_(p_gamma), + p_beta_(p_beta), + NumRows_(NumRows), + EmbeddingDim_(EmbeddingDim), + IndexLength_(IndexLength), + epsilon_(epsilon) + { + grid_size_ = (IndexLength + DimClusterSize - 1) / DimClusterSize; + } + + OutType* p_out_; + const EmbType* p_emb_a_; + const EmbType* p_emb_b_; + const EmbType* p_emb_c_; + const IndexType* p_index_a_; + const IndexType* p_index_b_; + const IndexType* p_index_c_; + const GammaDataType* p_gamma_; + const BetaDataType* 
p_beta_; + ck::index_t NumRows_; + ck::index_t EmbeddingDim_; + ck::index_t IndexLength_; + AccDataType epsilon_; + + size_t grid_size_; + }; + + virtual std::unique_ptr MakeArgumentPointer(void* p_out, + const void* p_emb_a, + const void* p_emb_b, + const void* p_emb_c, + const void* p_index_a, + const void* p_index_b, + const void* p_index_c, + const void* p_gamma, + const void* p_beta, + ck::index_t NumRows, + ck::index_t EmbeddingDim, + ck::index_t IndexLength, + const AccDataType epsilon) + { + return std::make_unique(reinterpret_cast(p_out), + reinterpret_cast(p_emb_a), + reinterpret_cast(p_emb_b), + reinterpret_cast(p_emb_c), + reinterpret_cast(p_index_a), + reinterpret_cast(p_index_b), + reinterpret_cast(p_index_c), + reinterpret_cast(p_gamma), + reinterpret_cast(p_beta), + NumRows, + EmbeddingDim, + IndexLength, + epsilon); + } + + using GridwiseSparseEmbedding = + GridwiseSparseEmbedding3ForwardLayernorm; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + auto out_desc = MakeOutputDescriptor(arg.IndexLength_, arg.EmbeddingDim_); + const auto kernel_main = + kernel_sparse_embedding3_forward_layernorm; + float avg_time = 0; + avg_time += launch_and_time_kernel(stream_config, + kernel_main, + dim3(arg.grid_size_), + dim3(BlockSize), + 0, + arg.p_out_, + arg.p_emb_a_, + arg.p_emb_b_, + arg.p_emb_c_, + arg.p_index_a_, + arg.p_index_b_, + arg.p_index_c_, + arg.p_gamma_, + arg.p_beta_, + out_desc, + arg.epsilon_); + + return (avg_time); + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + static bool IsSupportedArgument(const Argument* p_arg) + { + return (RowPerBlock == p_arg->EmbeddingDim_) && (p_arg->NumRows_ % DimPerBlock == 0); + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(dynamic_cast(p_arg)); + } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceSparseEmbedding3ForwardLayernorm_"<< BlockSize << "_" << + DimClusterSize << "x" << RowClusterSize << "_" << + DimPerBlock << "x" << RowPerBlock << "_" << + DimThreadSize << "x" << RowVectorSize; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp new file mode 100644 index 00000000..ea0f5897 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { + +enum struct MaskingSpecialization +{ + MaskDisabled, + MaskOutUpperTriangle +}; + +inline std::string getMaskingSpecializationString(const MaskingSpecialization& s) +{ + switch(s) + { + case MaskingSpecialization::MaskDisabled: return "MaskDisabled"; + case MaskingSpecialization::MaskOutUpperTriangle: return "MaskOutUpperTriangle"; + default: return "Unrecognized specialization!"; + } +} + +struct MaskDisabledPredicate +{ + __host__ __device__ constexpr bool operator()(index_t /*m*/, index_t /*n*/) const + { + return false; + }; + + __host__ __device__ constexpr bool + IsTileSkippable(index_t /*m*/, index_t /*n*/, index_t /*m_tile*/, index_t /*n_tile*/) const + { + return false; + } +}; + +struct MaskOutUpperTrianglePredicate +{ + __host__ __device__ constexpr bool operator()(index_t m, index_t n) const { return n > m; } + + __host__ __device__ constexpr bool + IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t /*n_tile*/) const + { + return operator()(m + m_tile - 1, n); + } +}; + +// to track the points which need to be set to -inf on C0 +// Note: no need to reset M padding value, because they will not be stored out. +template +struct C0MatrixMask_impl +{ + C0MatrixMask_impl(index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{}) {} + + __host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const + { + return n >= NRaw_; + } + + __host__ __device__ constexpr bool IsMaskedElement(index_t m, index_t n) const + { + return predicate_(m, n) || IsNOutOfBound(n); + } + + __host__ __device__ constexpr bool + IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t n_tile) const + { + return predicate_.IsTileSkippable(m, n, m_tile, n_tile); + } + + private: + // index_t MRaw_; + index_t NRaw_; + MaskOutPredicate predicate_; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/matrix_padder.hpp b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp new file mode 100644 index 00000000..70e61bc7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/matrix_padder.hpp @@ -0,0 +1,382 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +template + typename DoPads> // Sequence +__host__ __device__ constexpr auto +PadTensorDescriptor(const TensorDesc& desc, const TileLengths& tile_lengths, DoPads) +{ + constexpr index_t num_dim = DoPads::Size(); + + static_assert(num_dim == TileLengths::Size() && num_dim == TensorDesc::GetNumOfDimension(), + "wrong! 
inconsistent # of dimensions"); + + // transforms + const auto transforms = generate_tuple( + [&](auto idim) { + const auto MRaw = desc.GetLength(idim); + + const auto MPerTile = tile_lengths[idim]; + + const auto M = math::integer_divide_ceil(MRaw, MPerTile) * MPerTile; + + const auto MPad = M - MRaw; + + const bool DoPadM = DoPads::At(idim); + + const auto MTransform = conditional_expr(make_right_pad_transform(MRaw, MPad), + make_pass_through_transform(MRaw)); + + return MTransform; + }, + Number{}); + + // lower dimension Id + const auto lower_dimss = + generate_tuple([&](auto idim) { return Sequence{}; }, Number{}); + + // upper dimension Id + const auto upper_dimss = lower_dimss; + + return transform_tensor_descriptor(desc, transforms, lower_dimss, upper_dimss); +} + +// M/N/K/OPerTileType could be index_t or Number<> +template +struct GemmGemmPadder +{ + // TODO: hard to scale; use mask instead + static constexpr bool PadM = + GemmSpec == GemmSpecialization::MPadding || GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == GemmSpecialization::MOPadding || GemmSpec == GemmSpecialization::MNOPadding || + GemmSpec == GemmSpecialization::MKOPadding || GemmSpec == GemmSpecialization::MNKOPadding; + static constexpr bool PadN = + GemmSpec == GemmSpecialization::NPadding || GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == GemmSpecialization::NOPadding || GemmSpec == GemmSpecialization::MNOPadding || + GemmSpec == GemmSpecialization::NKOPadding || GemmSpec == GemmSpecialization::MNKOPadding; + static constexpr bool PadK = + GemmSpec == GemmSpecialization::KPadding || GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding || + GemmSpec == GemmSpecialization::KOPadding || GemmSpec == GemmSpecialization::MKOPadding || + GemmSpec == GemmSpecialization::NKOPadding || GemmSpec == GemmSpecialization::MNKOPadding; + static constexpr bool PadO = + GemmSpec == GemmSpecialization::OPadding || GemmSpec == GemmSpecialization::MOPadding || + GemmSpec == GemmSpecialization::NOPadding || GemmSpec == GemmSpecialization::KOPadding || + GemmSpec == GemmSpecialization::MNOPadding || GemmSpec == GemmSpecialization::MKOPadding || + GemmSpec == GemmSpecialization::NKOPadding || GemmSpec == GemmSpecialization::MNKOPadding; + + // A[M, K] + template + __host__ __device__ constexpr auto + PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const + { + return PadTensorDescriptor( + a_desc_mraw_kraw, make_tuple(MPerTile_, KPerTile_), Sequence{}); + } + + // B[K, N] + template + __host__ __device__ constexpr auto + PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const + { + return PadTensorDescriptor( + b_desc_nraw_kraw, make_tuple(NPerTile_, KPerTile_), Sequence{}); + } + + // B1[Gemm1N, Gemm1K] = B1[O, N] + template + __host__ __device__ constexpr auto + PadB1Descriptor_N_K(const B1Desc_NRaw_KRaw& b1_desc_nraw_kraw) const + { + return PadTensorDescriptor( + b1_desc_nraw_kraw, make_tuple(OPerTile_, NPerTile_), Sequence{}); + } + + // C[M, Gemm1N] = C[M, O] + template + __host__ __device__ constexpr auto + PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const + { + return PadTensorDescriptor( + c_desc_mraw_nraw, make_tuple(MPerTile_, OPerTile_), Sequence{}); + } + + MPerTileType MPerTile_; + NPerTileType 
NPerTile_; + KPerTileType KPerTile_; + OPerTileType OPerTile_; +}; + +// M/N/KPerTileType could be index_t or Number<> +template +struct GemmPadder +{ + static constexpr bool PadM = + (GemmSpec == GemmSpecialization::MPadding || GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::MKPadding || GemmSpec == GemmSpecialization::MNKPadding); + static constexpr bool PadN = + (GemmSpec == GemmSpecialization::NPadding || GemmSpec == GemmSpecialization::MNPadding || + GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding); + static constexpr bool PadK = + (GemmSpec == GemmSpecialization::KPadding || GemmSpec == GemmSpecialization::MKPadding || + GemmSpec == GemmSpecialization::NKPadding || GemmSpec == GemmSpecialization::MNKPadding); + + template + __host__ __device__ constexpr auto + PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const + { + return PadTensorDescriptor( + a_desc_mraw_kraw, make_tuple(MPerTile_, KPerTile_), Sequence{}); + } + + template + __host__ __device__ constexpr auto + PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const + { + return PadTensorDescriptor( + b_desc_nraw_kraw, make_tuple(NPerTile_, KPerTile_), Sequence{}); + } + + template + __host__ __device__ constexpr auto + PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const + { + return PadTensorDescriptor( + c_desc_mraw_nraw, make_tuple(MPerTile_, NPerTile_), Sequence{}); + } + + MPerTileType MPerTile_; + NPerTileType NPerTile_; + KPerTileType KPerTile_; +}; + +// Alias of GemmPadder; to deprecate +template +struct MatrixPadder : public GemmPadder +{ +}; + +// M/N/KPerTileType could be index_t or Number<> +template +struct GemmPadder_v2 +{ + template + __host__ __device__ constexpr auto + PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const + { + return PadTensorDescriptor( + a_desc_mraw_kraw, make_tuple(MPerTile_, KPerTile_), Sequence{}); + } + + template + __host__ __device__ constexpr auto + PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const + { + return PadTensorDescriptor( + b_desc_nraw_kraw, make_tuple(NPerTile_, KPerTile_), Sequence{}); + } + + template + __host__ __device__ constexpr auto + PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const + { + return PadTensorDescriptor( + c_desc_mraw_nraw, make_tuple(MPerTile_, NPerTile_), Sequence{}); + } + + MPerTileType MPerTile_; + NPerTileType NPerTile_; + KPerTileType KPerTile_; +}; + +// M/N/KPerTileType could be index_t or Number<> +template +struct MatrixPadder_v2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + template + __host__ __device__ constexpr auto + PadADescriptor_M_K(const ADesc_MRaw_KRaw& a_desc_mraw_kraw) const + { + const auto MRaw = a_desc_mraw_kraw.GetLength(I0); + const auto KRaw = a_desc_mraw_kraw.GetLength(I1); + + const auto M = math::integer_divide_ceil(MRaw, MPerTile_) * MPerTile_; + const auto K = math::integer_divide_ceil(KRaw, KPerTile_) * KPerTile_; + + const auto MPad = M - MRaw; + const auto KPad = K - KRaw; + + if constexpr(PadM && PadK) + { + // pad both M and K + return transform_tensor_descriptor(a_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(PadM && (!PadK)) + { + // pad M, but not K + return 
transform_tensor_descriptor( + a_desc_mraw_kraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(KRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr((!PadM) && PadK) + { + // pad K, but not M + return transform_tensor_descriptor( + a_desc_mraw_kraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or K + return a_desc_mraw_kraw; + } + } + + template + __host__ __device__ constexpr auto + PadBDescriptor_N_K(const BDesc_NRaw_KRaw& b_desc_nraw_kraw) const + { + const auto NRaw = b_desc_nraw_kraw.GetLength(I0); + const auto KRaw = b_desc_nraw_kraw.GetLength(I1); + + const auto N = math::integer_divide_ceil(NRaw, NPerTile_) * NPerTile_; + const auto K = math::integer_divide_ceil(KRaw, KPerTile_) * KPerTile_; + + const auto NPad = N - NRaw; + const auto KPad = K - KRaw; + + if constexpr(PadN && PadK) + { + // pad both N and K + return transform_tensor_descriptor(b_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), + make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(PadN && (!PadK)) + { + // pad N, but not K + return transform_tensor_descriptor( + b_desc_nraw_kraw, + make_tuple(make_right_pad_transform(NRaw, NPad), make_pass_through_transform(KRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr((!PadN) && PadK) + { + // pad K, but not N + return transform_tensor_descriptor( + b_desc_nraw_kraw, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad N or K + return b_desc_nraw_kraw; + } + } + + template + __host__ __device__ constexpr auto + PadCDescriptor_M_N(const CDesc_MRaw_NRaw& c_desc_mraw_nraw) const + { + const auto MRaw = c_desc_mraw_nraw.GetLength(I0); + const auto NRaw = c_desc_mraw_nraw.GetLength(I1); + + const auto M = math::integer_divide_ceil(MRaw, MPerTile_) * MPerTile_; + const auto N = math::integer_divide_ceil(NRaw, NPerTile_) * NPerTile_; + + const auto MPad = M - MRaw; + const auto NPad = N - NRaw; + + if constexpr(PadM && PadN) + { + // pad M and N + return transform_tensor_descriptor(c_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), + make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr(PadM && (!PadN)) + { + // pad M, but not N + return transform_tensor_descriptor( + c_desc_mraw_nraw, + make_tuple(make_right_pad_transform(MRaw, MPad), make_pass_through_transform(NRaw)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else if constexpr((!PadM) && PadN) + { + // pad N, but not M + return transform_tensor_descriptor( + c_desc_mraw_nraw, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(NRaw, NPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + else + { + // not pad M or N + return c_desc_mraw_nraw; + } + } + + MPerTileType MPerTile_; + NPerTileType NPerTile_; + KPerTileType KPerTile_; +}; +} // namespace device +} // namespace tensor_operation +} // namespace ck 
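The padders above all implement the same rule: each raw GEMM length is rounded up to the next multiple of its tile size and right-padded by the difference, and the GemmSpecialization flags decide which dimensions actually receive the right-pad transform versus a pass-through. A minimal host-side sketch of that arithmetic, separate from the patch (the names padded_length, MRaw and MPerTile below are illustrative only):

#include <cstdio>

// Mirrors math::integer_divide_ceil(MRaw, MPerTile) * MPerTile used by the padders above.
static int padded_length(int raw, int per_tile) { return (raw + per_tile - 1) / per_tile * per_tile; }

int main()
{
    const int MRaw = 1000, MPerTile = 256;
    const int M    = padded_length(MRaw, MPerTile); // 1024
    const int MPad = M - MRaw;                      // 24
    // With an M-padding GemmSpecialization the A/C descriptors get a right-pad transform
    // of MPad elements; otherwise a pass-through transform leaves the raw length intact.
    std::printf("MRaw=%d M=%d MPad=%d\n", MRaw, M, MPad);
    return 0;
}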
diff --git a/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
new file mode 100644
index 00000000..d3531835
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+// FIXME: can it be replaced with ck::Tuple?
+#include <tuple>
+
+namespace ck {
+
+// The templated struct reduce_binary_operator maps the enum Ids of binary operators to their
+// respective functor classes.
+// The boolean member "indexable" is also provided in reduce_binary_operator for
+// easier checking by the upper-layer codes in the kernels.
+
+template <ReduceTensorOp Op>
+struct reduce_binary_operator;
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::ADD>
+{
+    using opType = reduce::Add;
+
+    static constexpr bool indexable = false;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::MUL>
+{
+    using opType = reduce::Mul;
+
+    static constexpr bool indexable = false;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::MIN>
+{
+    using opType = reduce::Min;
+
+    static constexpr bool indexable = true;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::MAX>
+{
+    using opType = reduce::Max;
+
+    static constexpr bool indexable = true;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::AMAX>
+{
+    using opType = reduce::AMax;
+
+    static constexpr bool indexable = true;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::AVG>
+{
+    using opType = reduce::Add;
+
+    static constexpr bool indexable = false;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::NORM1>
+{
+    using opType = reduce::Add;
+
+    static constexpr bool indexable = false;
+};
+
+template <>
+struct reduce_binary_operator<ReduceTensorOp::NORM2>
+{
+    using opType = reduce::Add;
+
+    static constexpr bool indexable = false;
+};
+
+// The templated struct reduce_unary_operator maps the enum Ids of Reduce operators to two unary
+// functor classes.
+// The two unary functors are called before and afer the Reduction is executed respectively +template +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; +}; + +template +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation = tensor_operation::element_wise::UnaryDivide; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{reduceLength}); + }; +}; + +template +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; +}; + +template +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::UnaryAbs; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; +}; + +template <> +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; + using AccElementwiseOperation = tensor_operation::element_wise::PassThrough; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; +}; + +template <> +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::UnarySquare; + using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; +}; + +template <> +struct reduce_unary_operator +{ + using InElementwiseOperation = tensor_operation::element_wise::PassThrough; + using AccElementwiseOperation = tensor_operation::element_wise::UnarySqrt; + + static std::tuple + GetElementwiseOperator(int32_t reduceLength) + { + (void)reduceLength; + return std::make_tuple(InElementwiseOperation{}, AccElementwiseOperation{}); + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp new file mode 100644 index 00000000..b4442741 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp @@ -0,0 +1,417 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +namespace ck { +namespace tensor_layout { + +struct BaseTensorLayout +{ +}; + +namespace gemm { + +struct RowMajor : public BaseTensorLayout +{ + static constexpr const char* name = "RowMajor"; +}; + +struct ColumnMajor : public BaseTensorLayout +{ + static constexpr const char* name = "ColumnMajor"; +}; +} // namespace gemm + +namespace convolution { + +// input tensor +// packed NCW/NCHW/NCDHW +struct NCW : public BaseTensorLayout +{ + static constexpr const char* name = "NCW"; +}; + +struct NCHW : public BaseTensorLayout +{ + static constexpr const char* name = "NCHW"; +}; + +struct NCDHW : public BaseTensorLayout +{ + static constexpr const char* name = "NCDHW"; +}; + +// packed GNCW/GNCHW/GNCDHW +struct GNCW : public BaseTensorLayout +{ + static constexpr const char* name = "GNCW"; +}; + +struct GNCHW : public BaseTensorLayout +{ + static constexpr const char* name = "GNCHW"; +}; + +struct GNCDHW : public BaseTensorLayout +{ + static constexpr const char* name = "GNCDHW"; +}; + +// input tensor +// packed NWC/NHWC/NDHWC +struct NWC : public BaseTensorLayout +{ + static constexpr const char* name = "NWC"; +}; + +struct NHWC : public BaseTensorLayout +{ + static constexpr const char* name = "NHWC"; +}; + +struct NDHWC : public BaseTensorLayout +{ + static constexpr const char* name = "NDHWC"; +}; + +// input tensor +// packed GNWC/GNHWC/GNDHWC +struct GNWC : public BaseTensorLayout +{ + static constexpr const char* name = "GNWC"; +}; + +struct GNHWC : public BaseTensorLayout +{ + static constexpr const char* name = "GNHWC"; +}; + +struct GNDHWC : public BaseTensorLayout +{ + static constexpr const char* name = "GNDHWC"; +}; + +// for input bias +struct GC : public BaseTensorLayout +{ + static constexpr const char* name = "GC"; +}; + +// input tensor +// packed NWGC/NHWGC/NDHWGC +struct NWGC : public BaseTensorLayout +{ + static constexpr const char* name = "NWGC"; +}; + +struct NHWGC : public BaseTensorLayout +{ + static constexpr const char* name = "NHWGC"; +}; + +struct NDHWGC : public BaseTensorLayout +{ + static constexpr const char* name = "NDHWGC"; +}; + +// input tensor +// strided layout +struct G_NW_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_NW_C"; +}; + +struct G_NHW_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_NHW_C"; +}; + +struct G_NDHW_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_NDHW_C"; +}; + +// for input bias +struct G_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_C"; +}; + +// weight tensor +// packed KCX/KCYX/KCZYX +struct KCX : public BaseTensorLayout +{ + static constexpr const char* name = "KCX"; +}; + +struct KCYX : public BaseTensorLayout +{ + static constexpr const char* name = "KCYX"; +}; + +struct KCZYX : public BaseTensorLayout +{ + static constexpr const char* name = "KCZYX"; +}; + +// weight tensor +// packed KCX/KCYX/KCZYX +struct GKCX : public BaseTensorLayout +{ + static constexpr const char* name = "GKCX"; +}; + +struct GKCYX : public BaseTensorLayout +{ + static constexpr const char* name = "GKCYX"; +}; + +struct GKCZYX : public BaseTensorLayout +{ + static constexpr const char* name = "GKCZYX"; +}; + +// weight tensor +// packed KXC/KYXC/KZYXC +struct KXC : public BaseTensorLayout +{ + static constexpr const char* name = "KXC"; +}; + +struct KYXC : public BaseTensorLayout +{ + static constexpr const char* name = "KYXC"; +}; + +struct KZYXC : public BaseTensorLayout +{ + static constexpr const char* name = "KZYXC"; +}; + +// 
weight tensor +// packed GKXC/GKYXC/GKZYXC +struct GKXC : public BaseTensorLayout +{ + static constexpr const char* name = "GKXC"; +}; + +struct GKYXC : public BaseTensorLayout +{ + static constexpr const char* name = "GKYXC"; +}; + +struct GKZYXC : public BaseTensorLayout +{ + static constexpr const char* name = "GKZYXC"; +}; + +// weight tensor +// packed KXGC/KYXGC/KZYXGC +struct KXGC : public BaseTensorLayout +{ + static constexpr const char* name = "KXGC"; +}; + +struct KYXGC : public BaseTensorLayout +{ + static constexpr const char* name = "KYXGC"; +}; + +struct KZYXGC : public BaseTensorLayout +{ + static constexpr const char* name = "KZYXGC"; +}; + +// weight tensor +// strided +struct G_K_X_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_K_X_C"; +}; + +struct G_K_YX_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_K_YX_C"; +}; + +struct G_K_ZYX_C : public BaseTensorLayout +{ + static constexpr const char* name = "G_K_ZYX_C"; +}; + +// output tensor +// packed NKW/NKHW/NKDHW +struct NKW : public BaseTensorLayout +{ + static constexpr const char* name = "NKW"; +}; + +struct NKHW : public BaseTensorLayout +{ + static constexpr const char* name = "NKHW"; +}; + +struct NKDHW : public BaseTensorLayout +{ + static constexpr const char* name = "NKDHW"; +}; + +// output tensor +// packed GNKW/GNKHW/GNKDHW +struct GNKW : public BaseTensorLayout +{ + static constexpr const char* name = "GNKW"; +}; + +struct GNKHW : public BaseTensorLayout +{ + static constexpr const char* name = "GNKHW"; +}; + +struct GNKDHW : public BaseTensorLayout +{ + static constexpr const char* name = "GNKDHW"; +}; + +// output tensor +// packed NWK/NHWK/NDHWK +struct NWK : public BaseTensorLayout +{ + static constexpr const char* name = "NWK"; +}; + +struct NHWK : public BaseTensorLayout +{ + static constexpr const char* name = "NHWK"; +}; + +struct NDHWK : public BaseTensorLayout +{ + static constexpr const char* name = "NDHWK"; +}; + +// output tensor +// packed GNWK/GNHWK/GNDHWK +struct GNWK : public BaseTensorLayout +{ + static constexpr const char* name = "GNWK"; +}; + +struct GNHWK : public BaseTensorLayout +{ + static constexpr const char* name = "GNHWK"; +}; + +struct GNDHWK : public BaseTensorLayout +{ + static constexpr const char* name = "GNDHWK"; +}; + +// for output bias +struct GK : public BaseTensorLayout +{ + static constexpr const char* name = "GK"; +}; + +// output tensor +// packed NWGK/NHWGK/NDHWGK +struct NWGK : public BaseTensorLayout +{ + static constexpr const char* name = "NWGK"; +}; + +struct NHWGK : public BaseTensorLayout +{ + static constexpr const char* name = "NHWGK"; +}; + +struct NDHWGK : public BaseTensorLayout +{ + static constexpr const char* name = "NDHWGK"; +}; + +// output tensor +// strided layout +struct G_NW_K : public BaseTensorLayout +{ + static constexpr const char* name = "G_NW_K"; +}; + +struct G_NHW_K : public BaseTensorLayout +{ + static constexpr const char* name = "G_NHW_K"; +}; + +struct G_NDHW_K : public BaseTensorLayout +{ + static constexpr const char* name = "G_NDHW_K"; +}; + +// for output bias +struct G_K : public BaseTensorLayout +{ + static constexpr const char* name = "G_K"; +}; + +// K-reduced output tensor (packed) +struct GNW : public BaseTensorLayout +{ + static constexpr const char* name = "GNW"; +}; + +struct GNHW : public BaseTensorLayout +{ + static constexpr const char* name = "GNHW"; +}; + +struct GNDHW : public BaseTensorLayout +{ + static constexpr const char* name = "GNDHW"; +}; + +// K-reduced 
output tensor (packed) +struct NWG : public BaseTensorLayout +{ + static constexpr const char* name = "NWG"; +}; + +struct NHWG : public BaseTensorLayout +{ + static constexpr const char* name = "NHWG"; +}; + +struct NDHWG : public BaseTensorLayout +{ + static constexpr const char* name = "NDHWG"; +}; + +// K-reduced output tensor (strided) +struct G_NW : public BaseTensorLayout +{ + static constexpr const char* name = "G_NW"; +}; + +struct G_NHW : public BaseTensorLayout +{ + static constexpr const char* name = "G_NHW"; +}; + +struct G_NDHW : public BaseTensorLayout +{ + static constexpr const char* name = "G_NDHW"; +}; + +} // namespace convolution + +template < + typename Layout, + typename std::enable_if::value, bool>::type = false> +std::ostream& operator<<(std::ostream& os, const Layout&) +{ + os << Layout::name; + return os; +} + +} // namespace tensor_layout +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp b/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp new file mode 100644 index 00000000..0ec0df2c --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/tensor_specialization.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { + +enum struct TensorSpecialization +{ + Default, + Packed +}; + +inline std::string getTensorSpecializationString(const TensorSpecialization& s) +{ + switch(s) + { + case TensorSpecialization::Default: return "Default"; + case TensorSpecialization::Packed: return "Packed"; + default: return "Unrecognized specialization!"; + } +} + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/welford_helper.hpp b/include/ck/tensor_operation/gpu/device/welford_helper.hpp new file mode 100644 index 00000000..6c909b76 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/welford_helper.hpp @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct GetReduceCountPerThreadForBlockwiseWelford +{ + GetReduceCountPerThreadForBlockwiseWelford(index_t numBlockTileIteration, + long_index_t reduce_length) + : numBlockTileIteration_{numBlockTileIteration} + { + count_in_last_tile_ = reduce_length % K_BlockTileSize; + }; + + __device__ index_t operator()(index_t thread_k_cluster_id) const + { + if(count_in_last_tile_ == 0) + return (KThreadSliceSize * numBlockTileIteration_); + else + { + index_t num_complete_slice = count_in_last_tile_ / KThreadSliceSize; + index_t count_in_last_slice = count_in_last_tile_ % KThreadSliceSize; + + if(thread_k_cluster_id < num_complete_slice) + return (KThreadSliceSize * numBlockTileIteration_); + else if(thread_k_cluster_id == num_complete_slice) + return (KThreadSliceSize * (numBlockTileIteration_ - 1) + count_in_last_slice); + else + return (KThreadSliceSize * (numBlockTileIteration_ - 1)); + }; + }; + + index_t numBlockTileIteration_; + index_t count_in_last_tile_; +}; + +template +struct GetReduceCountPerThreadForMultiblockWelford +{ + GetReduceCountPerThreadForMultiblockWelford(index_t blkGroupSize, + index_t numBlockTileIteration, + long_index_t reduce_length) + : blkGroupSize_(blkGroupSize), numBlockTileIteration_{numBlockTileIteration} + { + last_block_reduce_length_ = + reduce_length - K_BlockTileSize * numBlockTileIteration_ * (blkGroupSize_ - 1); + numBlockTileIterationByLastBlock_ = + (last_block_reduce_length_ + K_BlockTileSize - 1) / K_BlockTileSize; + }; + + __device__ index_t operator()(index_t block_local_id, index_t thread_k_cluster_id) const + { + if(last_block_reduce_length_ == K_BlockTileSize * numBlockTileIteration_ || + block_local_id < blkGroupSize_ - 1) + return (KThreadSliceSize * numBlockTileIteration_); + + index_t count_in_last_tile = last_block_reduce_length_ % K_BlockTileSize; + + if(count_in_last_tile == 0) + return (KThreadSliceSize * numBlockTileIterationByLastBlock_); + else + { + index_t num_complete_slice = count_in_last_tile / KThreadSliceSize; + + if(thread_k_cluster_id < num_complete_slice) + return (KThreadSliceSize * numBlockTileIterationByLastBlock_); + else if(thread_k_cluster_id == num_complete_slice) + return (KThreadSliceSize * (numBlockTileIterationByLastBlock_ - 1) + + count_in_last_tile); + else + return (KThreadSliceSize * (numBlockTileIterationByLastBlock_ - 1)); + }; + }; + + index_t blkGroupSize_; + index_t numBlockTileIteration_; + + index_t last_block_reduce_length_; + index_t numBlockTileIterationByLastBlock_; +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp new file mode 100644 index 00000000..a4053b1f --- /dev/null +++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct Add +{ + template + __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const float& x1) const + { + y = x0 + x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + y = x0 + x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const half_t& x1) const + { + y = x0 + type_convert(x1); + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const float& x0, const half_t& x1) const + { + y = type_convert(x0) + x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + y = x0 + x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const + { + const float x1_tmp = ck::type_convert(x0); + const float x2_tmp = ck::type_convert(x1); + const float y_tmp = x1_tmp + x2_tmp; + y = ck::type_convert(y_tmp); + } + + template <> + __host__ __device__ constexpr void + operator()(int8_t& y, const int8_t& x0, const int8_t& x1) const + { + y = x0 + x1; + }; +}; + +struct Subtract +{ + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const float& x1) const + { + y = x0 - x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + y = x0 - x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + y = x0 - x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const + { + const float x1_tmp = ck::type_convert(x0); + const float x2_tmp = ck::type_convert(x1); + const float y_tmp = x1_tmp - x2_tmp; + y = ck::type_convert(y_tmp); + } + + template <> + __host__ __device__ constexpr void + operator()(int8_t& y, const int8_t& x0, const int8_t& x1) const + { + y = x0 - x1; + }; +}; + +struct Bilinear +{ + Bilinear(float alpha, float beta) : alpha_(alpha), beta_(beta){}; + + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const float& x1) const + { + y = alpha_ * x0 + beta_ * x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + y = type_convert(alpha_) * x0 + type_convert(beta_) * x1; + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const float& x0, const half_t& x1) const + { + y = type_convert(alpha_ * x0 + beta_ * ck::type_convert(x1)); + }; + + float alpha_; + float beta_; +}; + +struct AddRelu +{ + template + __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const float& x1) const + { + const float a = x0 + x1; + y = a > 0.0f ? 
a : 0.0f; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + const double a = x0 + x1; + y = a > 0.0 ? a : 0.0; + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + const half_t a = x0 + x1; + y = a > type_convert(0.0f) ? a : type_convert(0.0f); + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const float& x0, const half_t& x1) const + { + const float a = x0 + x1; + y = a > type_convert(0.0f) ? a : type_convert(0.0f); + }; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const half_t& x1) const + { + const float a = x0 + type_convert(x1); + y = a > 0.0f ? a : 0.0f; + }; + + template <> + __host__ __device__ constexpr void + operator()(int& y, const int& x0, const int8_t& x1) const + { + const int8_t a = x0 + x1; + y = a > 0 ? a : 0; + }; + + template <> + __host__ __device__ constexpr void + operator()(int8_t& y, const int8_t& x0, const int8_t& x1) const + { + const int8_t a = x0 + x1; + y = a > 0 ? a : 0; + }; +}; + +struct AddHardswish +{ + template + __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const; + + template <> + __host__ __device__ constexpr void + operator()(float& y, const float& x0, const float& x1) const + { + float a = x0 + x1; + float b = a + float{3}; + float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f; + y = c; + }; + + template <> + __host__ __device__ constexpr void + operator()(double& y, const double& x0, const double& x1) const + { + double a = x0 + x1; + double b = a + 3.0; + double c = (b > 0) * (b > 6.0 ? 6.0 : b) * a * 0.166667; + y = c; + }; + + template <> + __host__ __device__ constexpr void + operator()(half_t& y, const half_t& x0, const half_t& x1) const + { + float a = x0 + x1; + float b = a + 3.0f; + float c = (b > 0) * (b > 6.0f ? 6.0f : b) * a * 0.166667f; + y = c; + }; +}; + +// C = A * B +// E = FastGelu(C + D) +struct AddFastGelu +{ + // Fast GeLU + // https://paperswithcode.com/method/gelu + // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))) + __host__ __device__ static constexpr float GetFastGeLU(float x) + { + const float u = 2.f * x * (0.035677f * x * x + 0.797885f); + const float emu = exp(-u); + const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f); + return x * cdf; + } + + template + static inline constexpr bool is_valid_param_type_v = + std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v; + + template + __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const + { + static_assert(is_valid_param_type_v && is_valid_param_type_v && + is_valid_param_type_v); + + const float y = GetFastGeLU(type_convert(c) + type_convert(d)); + + e = type_convert(y); + } + + template + __host__ __device__ constexpr void operator()(float& e, const float& c, const D& d) const + { + static_assert(is_valid_param_type_v); + + e = GetFastGeLU(c + type_convert(d)); + } +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp new file mode 100644 index 00000000..b66107a5 --- /dev/null +++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/quantization_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +// Need to ensure compiler will fail if there is no matching candidate, instead of compiler +// siliently do implicit type conversion +// +// Method 1: +// +// struct ExampleElementwiseOp +// { +// template +// __host__ __device__ constexpr void +// operator()(Y&, const X) const; +// +// template<> +// __host__ __device__ constexpr void +// operator()(half_t& y, const half_t& x) const +// { +// } +// }; +// +// Method 2: +// +// template +// struct ExampleElementwiseOp; +// +// template <> +// struct ExampleElementwiseOp +// { +// __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const +// { +// } +// }; + +struct AddReluAdd +{ + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const; + + template <> + __host__ __device__ constexpr void operator()( + half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const + { + half_t a = x0 + x1; + half_t b = a > 0 ? a : 0; + y = b + x2; + } + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x0, + const float& x1, + const float& x2) const + { + float a = x0 + x1; + float b = a > 0 ? a : 0; + float c = b + x2; + y = c; + } + + template <> + __host__ __device__ constexpr void operator()( + half_t& y, const float& x0, const half_t& x1, const half_t& x2) const + { + float a = x0 + x1; + float b = a > 0 ? a : 0; + float c = b + x2; + y = c; + } + + template <> + __host__ __device__ constexpr void operator()( + bhalf_t& y, const float& x0, const bhalf_t& x1, const bhalf_t& x2) const + { + float a = x0 + x1; + float b = a > 0 ? a : 0; + float c = b + x2; + y = c; + } + + template <> + __host__ __device__ constexpr void operator()( + int8_t& y, const int8_t& x0, const int8_t& x1, const int8_t& x2) const + { + int32_t a = x0 + x1; + int32_t b = a > 0 ? a : 0; + int32_t c = b + x2; + y = c; + } + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + template <> + __host__ __device__ constexpr void operator()( + int4_t& y, const int8_t& x0, const int4_t& x1, const int4_t& x2) const + { + int32_t a = x0 + x1; + int32_t b = a > 0 ? a : 0; + int32_t c = b + x2; + y = c; + } +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +}; + +struct AddHardswishAdd +{ + template + __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&, const X2&) const; + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x0, + const float& x1, + const float& x2) const + { + float a = x0 + x1; + float b = a + float{3}; + float c = (b > 0) * (b > float{6} ? float{6} : b) * a * float{0.166667}; + float d = c + x2; + y = d; + } + + template <> + __host__ __device__ constexpr void operator()( + half_t& y, const half_t& x0, const half_t& x1, const half_t& x2) const + { + float a = x0 + x1; + float b = a + float{3}; + float c = (b > 0) * (b > float{6} ? 
float{6} : b) * a * float{0.166667}; + float d = c + x2; + y = d; + } +}; + +// C = A * B +// E = C + D0 + D1 +struct AddAdd +{ + template + __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const + { + // Only support floating so far + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + const C y = c + type_convert(d0) + type_convert(d1); + e = type_convert(y); + } +}; + +// C = A * B +// E = FastGelu(C + D0 + D1) +struct AddAddFastGelu +{ + // Fast GeLU + // https://paperswithcode.com/method/gelu + // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))) + __host__ __device__ static constexpr float GetFastGeLU(float x) + { + const float u = 2.f * x * (0.035677f * x * x + 0.797885f); + const float emu = exp(-u); + const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f); + return x * cdf; + } + + template + static inline constexpr bool is_valid_param_type_v = + std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + || std::is_same_v +#endif + ; + + template + __host__ __device__ constexpr void + operator()(E& e, const C& c, const D0& d0, const D1& d1) const + { + static_assert(is_valid_param_type_v && is_valid_param_type_v && + is_valid_param_type_v && is_valid_param_type_v); + + const float y = + GetFastGeLU(type_convert(c) + type_convert(d0) + type_convert(d1)); + + e = type_convert(y); + } +}; + +struct Normalize +{ + // FIXME: is double absolutely necessary? + Normalize(double epsilon = 1e-4) : epsilon_(epsilon) {} + + template + __host__ __device__ constexpr void operator()(T1& y, + const T1& x, + const T2& mean, + const T2& mean_square, + const T3& gamma, + const T3& beta) const; + + template <> + __host__ __device__ constexpr void operator()(half_t& y, + const half_t& x, + const float& mean, + const float& mean_square, + const half_t& gamma, + const half_t& beta) const + { + using ck::math::sqrt; + + float variance = mean_square - (mean * mean); + + float tmp_x = type_convert(x); + float tmp_gamma = type_convert(gamma); + float tmp_beta = type_convert(beta); + + float tmp_y = + ((tmp_x - mean) / sqrt(variance + type_convert(epsilon_))) * tmp_gamma + + tmp_beta; + + y = type_convert(tmp_y); + }; + + template <> + __host__ __device__ constexpr void operator()(float& y, + const float& x, + const float& mean, + const float& mean_square, + const float& gamma, + const float& beta) const + { + using ck::math::sqrt; + + float variance = mean_square - (mean * mean); + y = ((x - mean) / sqrt(variance + type_convert(epsilon_))) * gamma + beta; + }; + + template <> + __host__ __device__ constexpr void operator()(double& y, + const double& x, + const double& mean, + const double& mean_square, + const double& gamma, + const double& beta) const + { + using ck::math::sqrt; + + double variance = mean_square - (mean * mean); + y = ((x - mean) / sqrt(variance + epsilon_)) * gamma + beta; + }; + + // FIXME: is double absolutely necessary? 
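+    // Descriptive note (added commentary, not a behavioural change): the specializations
+    // above receive the running moments E[x] (mean) and E[x*x] (mean_square) and recover
+    // the variance as Var[x] = E[x*x] - E[x]^2, then apply the usual affine normalization
+    //     y = (x - mean) / sqrt(variance + epsilon) * gamma + beta
+    // (the half_t path computes in float and converts back to half_t at the end).
+    // epsilon_ is kept as double and narrowed with type_convert inside each specialization;
+    // a float epsilon would most likely suffice for the fp16/fp32 paths, which is what the
+    // FIXME above is questioning.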
+ double epsilon_; +}; + +template +struct UnaryTypeConvert; + +template <> +struct UnaryTypeConvert +{ + __host__ __device__ void operator()(float& y, ck::bhalf_t& x) const + { + y = ck::type_convert(x); + } +}; + +template <> +struct UnaryTypeConvert +{ + __host__ __device__ void operator()(ck::bhalf_t& y, float& x) const + { + y = ck::type_convert(x); + } +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/quantization_operation.hpp b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp new file mode 100644 index 00000000..3f2c2f87 --- /dev/null +++ b/include/ck/tensor_operation/gpu/element/quantization_operation.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "ck/utility/data_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +// For Activation function which is piecewise linear function, such as relu, leaky relu ...etc +template +struct Activation_Mul_Clamp +{ + Activation_Mul_Clamp(float requantScale, Activation activationOp) + : requantScale_(requantScale), activationOp_(activationOp) + { + } + + __host__ __device__ constexpr void operator()(int8_t& y, const int32_t& x) const + { + float x_fp32 = ck::type_convert(x); + activationOp_(x_fp32, x_fp32); + float y_fp32 = math::clamp(requantScale_ * x_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + __host__ __device__ constexpr void operator()(float& y, const int32_t& x) const + { + // We might type_convert to int8 after lambda in someplace + float x_fp32 = ck::type_convert(x); + activationOp_(x_fp32, x_fp32); + y = math::clamp(requantScale_ * x_fp32, -128.f, 127.f); + } + + float requantScale_; + Activation activationOp_; +}; + +// Conv Perchannel quantization + Activation function which is piecewise linear function, such as +// relu, leaky relu ...etc +template +struct Activation_Mul2_Clamp +{ + Activation_Mul2_Clamp(Activation activationOp) : activationOp_(activationOp) {} + + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x, const float& requantScale) const + { + float y_fp32 = ck::type_convert(x); + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + Activation activationOp_; +}; + +// For Activation function which is piecewise linear function, such as relu, leaky relu ...etc +template +struct Add_Activation_Mul_Clamp +{ + Add_Activation_Mul_Clamp(float requantScale, Activation activationOp) + : requantScale_(requantScale), activationOp_(activationOp) + { + } + + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x, const int32_t& bias) const + { + float y_fp32 = ck::type_convert(x + bias); + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(requantScale_ * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + float requantScale_; + Activation activationOp_; +}; + +// Conv Perchannel quantization + Activation function which is piecewise linear function, such as +// relu, leaky relu ...etc +template +struct Add_Activation_Mul2_Clamp +{ + Add_Activation_Mul2_Clamp(Activation activationOp) : activationOp_(activationOp) {} + + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x, const int32_t& bias, const float& requantScale) const + { + float y_fp32 = ck::type_convert(x + bias); + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(requantScale * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + Activation 
activationOp_; +}; + +// For Activation function which is non piecewise linear function, such as TanH, Sigmoid ...etc +template +struct Add_Mul_Activation_Mul_Clamp +{ + Add_Mul_Activation_Mul_Clamp(float requantScale1, float requantScale2, Activation activationOp) + : requantScale1_(requantScale1), requantScale2_(requantScale2), activationOp_(activationOp) + { + } + + __host__ __device__ constexpr void + operator()(int8_t& y, const int32_t& x, const int32_t& bias) const + { + float y_fp32 = ck::type_convert(x + bias); + y_fp32 = requantScale1_ * y_fp32; + activationOp_(y_fp32, y_fp32); + y_fp32 = math::clamp(requantScale2_ * y_fp32, -128.f, 127.f); + y = ck::type_convert(y_fp32); + } + + float requantScale1_; + float requantScale2_; + Activation activationOp_; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp new file mode 100644 index 00000000..fbdfe926 --- /dev/null +++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp @@ -0,0 +1,268 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math.hpp" +#include "ck/utility/math_v2.hpp" + +namespace ck { +namespace tensor_operation { +namespace element_wise { + +struct PassThrough +{ + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(double& y, const double& x) const + { + y = x; + } + + template <> + __host__ __device__ void operator()(float& y, const float& x) const + { + y = x; + } + + template <> + __host__ __device__ void operator()(half_t& y, const half_t& x) const + { + y = x; + } + + template <> + __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const + { + y = x; + } + + template <> + __host__ __device__ void operator()(int32_t& y, const int32_t& x) const + { + y = x; + } + + template <> + __host__ __device__ void operator()(bhalf_t& y, const float& x) const + { + y = type_convert(x); + } + + template <> + __host__ __device__ void operator()(int8_t& y, const int8_t& x) const + { + y = x; + } + + template <> + __host__ __device__ void operator()(int8_t& y, const int32_t& x) const + { + y = type_convert(x); + } + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + template <> + __host__ __device__ void operator()(int4_t& y, const int4_t& x) const + { + y = x; + } +#endif +}; + +struct UnaryConvert +{ + template + __host__ __device__ void operator()(Y& y, const X& x) const + { + y = type_convert(x); + } +}; + +struct Scale +{ + __host__ __device__ Scale(float scale) : scale_(scale) {} + + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(float& y, const float& x) const + { + y = scale_ * x; + }; + + float scale_; +}; + +struct ScaleAndResetNaNToMinusInfinity +{ + __host__ __device__ ScaleAndResetNaNToMinusInfinity(float scale) : scale_(scale) {} + + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(float& y, const float& x) const + { + y = ck::math::isnan(x) ? 
-ck::NumericLimits::Infinity() : scale_ * x; + }; + + float scale_; +}; + +struct UnaryDivide +{ + __host__ __device__ UnaryDivide(const int32_t divider = 1) : divider_(divider) {} + + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = x / type_convert(divider_); + }; + + int32_t divider_ = 1; +}; + +struct UnarySquare +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same_v || is_same_v || is_same_v || + is_same_v +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + || is_same_v +#endif + , + "Data type is not supported by this operation!"); + y = x * x; + }; +}; + +struct UnaryAbs +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = ck::math::abs(x); + }; +}; + +struct UnarySqrt +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value, + "Data type is not supported by this operation!"); + + y = ck::math::sqrt(x); + }; +}; + +struct Relu +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + y = x > 0 ? x : 0; + } + + template <> + __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const + { + float x_f32 = ck::type_convert(x); + float y_f32 = x_f32 > 0 ? x_f32 : 0; + y = ck::type_convert(y_f32); + } +}; + +// Y = FastGelu(X) +struct FastGelu +{ + // Fast GeLU + // https://paperswithcode.com/method/gelu + // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))) + __host__ __device__ static constexpr float GetFastGeLU(float x) + { + const float u = 2.f * x * (0.035677f * x * x + 0.797885f); + const float emu = exp(-u); + const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f); + return x * cdf; + } + + template + static inline constexpr bool is_valid_param_type_v = + std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + || std::is_same_v +#endif + ; + + template + __host__ __device__ void operator()(Y& y, const X& x) const + { + static_assert(is_valid_param_type_v && is_valid_param_type_v); + + const float tmp_y = GetFastGeLU(type_convert(x)); + y = type_convert(tmp_y); + } +}; + +// https://paperswithcode.com/method/gelu +// y = 0.5*x*(1+erf(x/sqrt(2))) +struct Gelu +{ + template + __host__ __device__ void operator()(Y& y, const X& x) const; + + template <> + __host__ __device__ void operator()(float& y, const float& x) const + { + y = 0.5f * x * (1.f + erf(float(0.70710678118f * x))); + } + + template <> + __host__ __device__ void operator()(ck::half_t& y, + const ck::half_t& x) const + { + y = ck::half_t(0.5) * x * (ck::half_t(1) + ck::half_t(erf(float(0.70710678118f * x)))); + } +}; + +struct Sigmoid +{ + template + __host__ __device__ void operator()(T& y, const T& x) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "Data type is not supported by this operation!"); + + y = 1 / (ck::type_convert(1) + exp(-x)); + }; + + int32_t divider_ = 1; +}; + +} // namespace element_wise +} // namespace tensor_operation +} // 
namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp new file mode 100644 index 00000000..a72a4ee0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_reduce_second_half_batchnorm_backward_final.hpp @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_second_half_batchnorm_backward_final( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const XYGridDesc_M_K dx_grid_desc_m_k, + const DscaleDbiasGridDesc_M_K dscale_dbias_grid_desc_m_k, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M bias_grid_desc_m, + index_t blkgroup_size, + long_index_t reduce_size, + index_t num_xy_k_block_tile_iteration, + index_t num_dscale_dbias_k_block_tile_iteration, + const DscaleDbiasDataType* const __restrict__ p_reduce_dscale, + const DscaleDbiasDataType* const __restrict__ p_reduce_dbias, + const MeanVarDataType* const __restrict__ p_mean, + const MeanVarDataType* const __restrict__ p_inv_var, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + DscaleDbiasDataType* const __restrict__ p_dscale, + DscaleDbiasDataType* const __restrict__ p_dbias) +{ + GridwiseReduceSecondHalfBatchNormBackwardFinal_::Run(x_grid_desc_m_k, + dy_grid_desc_m_k, + dx_grid_desc_m_k, + dscale_dbias_grid_desc_m_k, + mean_var_grid_desc_m, + scale_grid_desc_m, + bias_grid_desc_m, + blkgroup_size, + reduce_size, + num_xy_k_block_tile_iteration, + num_dscale_dbias_k_block_tile_iteration, + p_reduce_dscale, + p_reduce_dbias, + p_mean, + p_inv_var, + p_x, + p_dy, + p_scale, + dy_elementwise_op, + p_dx, + p_dscale, + p_dbias); +}; + +template +struct GridwiseReduceSecondHalfBatchNormBackwardFinal +{ + static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0 && + MThreadSliceSize % DxDstVectorSize == 0) || + (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0 && + KThreadSliceSize % DxDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_1 = decltype( + make_naive_tensor_descriptor_packed(make_tuple(Number{}, 
Number<1>{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // Two of the steps of Multiblock BatchNorm Backward + // Step 1: Second half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // Step 2: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly + // clang-format on + __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k, + const XYGridDesc_M_K& dy_grid_desc_m_k, + const XYGridDesc_M_K& dx_grid_desc_m_k, + const DscaleDbiasGridDesc_M_K& dscale_dbias_grid_desc_m_k, + const MeanVarGridDesc_M& mean_var_grid_desc_m, + const ScaleBiasGridDesc_M& scale_grid_desc_m, + const ScaleBiasGridDesc_M& dscale_dbias_grid_desc_m, + index_t blkgroup_size, + long_index_t reduce_size, + index_t num_xy_k_block_tile_iteration, + index_t num_dscale_dbias_k_block_tile_iteration, + const DscaleDbiasDataType* const __restrict__ p_reduce_dscale, + const DscaleDbiasDataType* const __restrict__ p_reduce_dbias, + const MeanVarDataType* const __restrict__ p_mean, + const MeanVarDataType* const __restrict__ p_inv_var, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + DscaleDbiasDataType* const __restrict__ p_dscale, + DscaleDbiasDataType* const __restrict__ p_dbias) + { + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + reduce_dscale_thread_buf; + StaticBuffer + reduce_dbias_thread_buf; + + StaticBuffer dscale_thread_buf; + StaticBuffer dbias_thread_buf; + + StaticBuffer + x_thread_buf; + StaticBuffer + dy_thread_buf; + StaticBuffer + dx_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer + inv_var_thread_buf; + StaticBuffer scale_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + // clang-format off + // Step 1: do final reduction of dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // clang-format 
on + + auto threadwise_dscale_dbias_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + dscale_dbias_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + auto threadwise_dscale_dbias_store_m = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + DscaleDbiasDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + dscale_dbias_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + const auto reduce_dscale_global_buf = make_dynamic_buffer( + p_reduce_dscale, dscale_dbias_grid_desc_m_k.GetElementSpaceSize()); + + const auto reduce_dbias_global_buf = make_dynamic_buffer( + p_reduce_dbias, dscale_dbias_grid_desc_m_k.GetElementSpaceSize()); + + auto dscale_global_buf = make_dynamic_buffer( + p_dscale, dscale_dbias_grid_desc_m.GetElementSpaceSize()); + + auto dbias_global_buf = make_dynamic_buffer( + p_dbias, dscale_dbias_grid_desc_m.GetElementSpaceSize()); + + constexpr auto dscale_dbias_thread_copy_step_m_k = + make_multi_index(0, KThreadClusterSize * 1); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + dscale_thread_buf(I) = type_convert(0.0f); + dbias_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_dscale_dbias_k_block_tile_iteration; + ++reducedTiles) + { + threadwise_dscale_dbias_load_m_k.Run(dscale_dbias_grid_desc_m_k, + reduce_dscale_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dscale_thread_buf); + + threadwise_dscale_dbias_load_m_k.Run(dscale_dbias_grid_desc_m_k, + reduce_dbias_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dbias_thread_buf); + + ThreadwiseReduce::Reduce(reduce_dscale_thread_buf, dscale_thread_buf); + ThreadwiseReduce::Reduce(reduce_dbias_thread_buf, dbias_thread_buf); + + threadwise_dscale_dbias_load_m_k.MoveSrcSliceWindow(dscale_dbias_grid_desc_m_k, + dscale_dbias_thread_copy_step_m_k); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I)); + block_sync_lds(); + BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I)); + }); + + threadwise_dscale_dbias_store_m.Run(thread_buffer_desc_m, + make_tuple(I0), + dscale_thread_buf, + dscale_dbias_grid_desc_m, + dscale_global_buf); + + threadwise_dscale_dbias_store_m.Run(thread_buffer_desc_m, + make_tuple(I0), + dbias_thread_buf, + dscale_dbias_grid_desc_m, + dbias_global_buf); + + // clang-format off + // Step 2: calculate dx = 1/N * inv-variance * scale * (N * dy - dbias - dscale * (x - mean) * inv-variance) + // clang-format on + + const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2( + dy_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dx_store = + ThreadwiseTensorSliceTransfer_v1r3( + dx_grid_desc_m_k, + make_multi_index( + blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * 
block_local_id + thread_k_cluster_id * KThreadSliceSize), + PassThroughOp{}); + + auto threadwise_scale_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + ScaleSrcVectorSize, + 1, + true>( + scale_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + auto threadwise_mean_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + const auto x_global_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto dy_global_buf = make_dynamic_buffer( + p_dy, dy_grid_desc_m_k.GetElementSpaceSize()); + + auto dx_global_buf = make_dynamic_buffer( + p_dx, dx_grid_desc_m_k.GetElementSpaceSize()); + + const auto scale_global_buf = make_dynamic_buffer( + p_scale, scale_grid_desc_m.GetElementSpaceSize()); + + const auto mean_global_buf = make_dynamic_buffer( + p_mean, mean_var_grid_desc_m.GetElementSpaceSize()); + + const auto inv_var_global_buf = make_dynamic_buffer( + p_inv_var, mean_var_grid_desc_m.GetElementSpaceSize()); + + threadwise_scale_load.Run(scale_grid_desc_m, + scale_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + scale_thread_buf); + + threadwise_mean_var_load.Run(mean_var_grid_desc_m, + mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf); + + threadwise_mean_var_load.Run(mean_var_grid_desc_m, + inv_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf); + + constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize); + + AccDataType inv_reduce_size = + type_convert(1.0) / type_convert(reduce_size); + + for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dy_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType multiplier = + inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM]; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + AccDataType tmpVal = norm_x * dscale_thread_buf[iM]; + + dx_thread_buf(Number{}) = + multiplier * + (type_convert(reduce_size) * dy_thread_buf[Number{}] - + dbias_thread_buf[iM] - tmpVal); + }); + }); + + threadwise_dx_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + dx_thread_buf, + dx_grid_desc_m_k, + dx_global_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, xy_thread_copy_step_m_k); + } + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp new file mode 100644 index 00000000..08cb0dd1 --- /dev/null +++ 
b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_first_half.hpp @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_multiblock_welford_first_half( + const XGridDesc_M_K x_grid_desc_m_k, + const MeanVarCountGridDesc_M_G mean_var_count_grid_desc_m_g, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + index_t num_k_block_tile_iteration, + const XDataType* const __restrict__ p_x, + MeanVarDataType* const p_welford_mean, + MeanVarDataType* const p_welford_variance, + int32_t* const p_welford_count) +{ + GridwiseMultiblockWelfordFirstHalf_::Run(x_grid_desc_m_k, + mean_var_count_grid_desc_m_g, + get_reduce_count_per_thread, + num_k_block_tile_iteration, + p_x, + p_welford_mean, + p_welford_variance, + p_welford_count); +}; + +template +struct GridwiseMultiblockWelfordFirstHalf +{ + static_assert((XSrcCountSrcVectorDim == 0 && MThreadSliceSize % XSrcCountSrcVectorSize == 0) || + (XSrcCountSrcVectorDim == 1 && + KThreadSliceSize % XSrcCountSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XSrcCountSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // First half of the Multiblock Welford method to calculate mean and variance, used by both batchnorm-forward and batchnorm-backward. 
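+    //
+    // Added reference note (hedged summary; the exact bookkeeping lives in ThreadwiseWelford
+    // and BlockwiseWelford): each thread accumulates its slice with the classic single-pass
+    // Welford recurrence
+    //     count += 1;  delta = x - mean;  mean += delta / count;  M2 += delta * (x - mean);
+    // and partial results are combined with the standard pairwise merge
+    //     count = cA + cB;  delta = meanB - meanA;
+    //     mean  = meanA + delta * cB / count;
+    //     M2    = M2A + M2B + delta * delta * cA * cB / count;
+    // Each workgroup of a block-group then writes one (mean, variance, count) triple per row
+    // of its M tile at column block_local_id of the M_G descriptor, to be merged by the
+    // second-half kernels.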
+ // clang-format on + __device__ static void Run(const XGridDesc_M_K& x_grid_desc_m_k, + const MeanVarCountGridDesc_M_G& mean_var_count_grid_desc_m_g, + const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread, + index_t num_k_block_tile_iteration, + const XDataType* const __restrict__ p_x, + MeanVarDataType* const p_welford_mean, + MeanVarDataType* const p_welford_variance, + int32_t* const p_welford_count) + { + StaticBuffer + x_thread_buf; + + StaticBuffer + welford_mean_thread_buf; + StaticBuffer + welford_var_thread_buf; + StaticBuffer + welford_count_thread_buf; + + const index_t blkgroup_size = mean_var_count_grid_desc_m_g.GetLength(I1); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_welford_mean_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_count_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + auto threadwise_welford_count_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_count_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize); + + const auto x_global_val_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + auto welford_mean_global_val_buf = make_dynamic_buffer( + p_welford_mean, mean_var_count_grid_desc_m_g.GetElementSpaceSize()); + + auto welford_var_global_val_buf = make_dynamic_buffer( + p_welford_variance, mean_var_count_grid_desc_m_g.GetElementSpaceSize()); + + auto welford_count_global_val_buf = make_dynamic_buffer( + p_welford_count, mean_var_count_grid_desc_m_g.GetElementSpaceSize()); + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = + get_reduce_count_per_thread(block_local_id, thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_mean_thread_buf(I) = type_convert(0.0f); + welford_var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + 
threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf, welford_mean_thread_buf, welford_var_thread_buf); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + welford_count_thread_buf(I) = threadwise_welford.cur_count_; + BlockwiseWelford::Run( + welford_mean_thread_buf(I), welford_var_thread_buf(I), welford_count_thread_buf(I)); + }); + + if(thread_k_cluster_id == 0) + { + threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + welford_mean_thread_buf, + mean_var_count_grid_desc_m_g, + welford_mean_global_val_buf); + + threadwise_welford_mean_var_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + welford_var_thread_buf, + mean_var_count_grid_desc_m_g, + welford_var_global_val_buf); + + threadwise_welford_count_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + welford_count_thread_buf, + mean_var_count_grid_desc_m_g, + welford_count_global_val_buf); + }; + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp new file mode 100644 index 00000000..548d7fd4 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_batchnorm_forward_final.hpp @@ -0,0 +1,571 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_welford_second_half_batchnorm_forward_final( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K y_grid_desc_m_k, + const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M bias_grid_desc_m, + const MeanVarGridDesc_M mean_var_grid_desc_m, + index_t blkgroup_size, + index_t num_xy_k_block_tile_iteration, + index_t num_mean_var_count_k_block_tile_iteration, + AccDataType epsilon, + const MeanVarDataType* const __restrict__ p_in_welford_mean, + const MeanVarDataType* const __restrict__ p_in_welford_variance, + const int32_t* const __restrict__ p_in_welford_count, + const XDataType* const __restrict__ p_x, + const ScaleDataType* const __restrict__ p_scale, + const BiasDataType* const __restrict__ p_bias, + const YElementwiseOp y_elementwise_op, + YDataType* const __restrict__ p_y, + bool updateMovingAverage, + AccDataType averageFactor, + MeanVarDataType* const __restrict__ resultRunningMean, + MeanVarDataType* const __restrict__ resultRunningVariance, + bool saveMeanInvVariance, + MeanVarDataType* const __restrict__ resultSaveMean, + MeanVarDataType* const __restrict__ resultSaveInvVariance) +{ + GridwiseWelfordSecondHalfBatchNormForwardFinal_::Run(x_grid_desc_m_k, + y_grid_desc_m_k, + mean_var_count_grid_desc_m_k, + scale_grid_desc_m, + bias_grid_desc_m, + mean_var_grid_desc_m, + blkgroup_size, + num_xy_k_block_tile_iteration, + num_mean_var_count_k_block_tile_iteration, + 
epsilon, + p_in_welford_mean, + p_in_welford_variance, + p_in_welford_count, + p_x, + p_scale, + p_bias, + y_elementwise_op, + p_y, + updateMovingAverage, + averageFactor, + resultRunningMean, + resultRunningVariance, + saveMeanInvVariance, + resultSaveMean, + resultSaveInvVariance); +}; + +template +struct GridwiseWelfordSecondHalfBatchNormForwardFinal +{ + static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcYDstVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || + (XSrcYDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XSrcYDstVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_1 = decltype( + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number<1>{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelfordMerge; + + using BlockwiseWelford = BlockwiseWelford; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k, + const XYGridDesc_M_K& y_grid_desc_m_k, + const MeanVarCountGridDesc_M_K& mean_var_count_grid_desc_m_k, + const ScaleBiasGridDesc_M& scale_grid_desc_m, + const ScaleBiasGridDesc_M& bias_grid_desc_m, + const MeanVarGridDesc_M& mean_var_grid_desc_m, + index_t blkgroup_size, + index_t num_xy_k_block_tile_iteration, + index_t num_mean_var_count_k_block_tile_iteration, + AccDataType epsilon, + const MeanVarDataType* const __restrict__ p_in_welford_mean, + const MeanVarDataType* const __restrict__ p_in_welford_variance, + const int32_t* const __restrict__ p_in_welford_count, + const XDataType* const __restrict__ p_x, + const ScaleDataType* const __restrict__ p_scale, + const BiasDataType* const __restrict__ p_bias, + const YElementwiseOp y_elementwise_op, + YDataType* const __restrict__ p_y, + bool updateMovingAverage, + AccDataType averageFactor, + MeanVarDataType* const __restrict__ resultRunningMean, + MeanVarDataType* const __restrict__ resultRunningVariance, + bool saveMeanInvVariance, + MeanVarDataType* const __restrict__ resultSaveMean, + MeanVarDataType* const __restrict__ resultSaveInvVariance) + + { + using ck::math::sqrt; + + StaticBuffer + in_welford_mean_thread_buf; + StaticBuffer + in_welford_var_thread_buf; + StaticBuffer + in_welford_count_thread_buf; + + StaticBuffer + welford_mean_thread_buf; + StaticBuffer + welford_var_thread_buf; + StaticBuffer + welford_count_thread_buf; + + StaticBuffer + x_thread_buf; + StaticBuffer + y_thread_buf; + + StaticBuffer scale_thread_buf; + StaticBuffer bias_thread_buf; + + const index_t 
thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + auto threadwise_mean_var_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + mean_var_count_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + auto threadwise_count_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + mean_var_count_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + const auto welford_mean_global_val_buf = make_dynamic_buffer( + p_in_welford_mean, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + const auto welford_var_global_val_buf = make_dynamic_buffer( + p_in_welford_variance, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + const auto welford_count_global_val_buf = make_dynamic_buffer( + p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + constexpr auto mean_var_count_thread_copy_step_m_k = + make_multi_index(0, KThreadClusterSize * 1); + + // Step 1: do final welford reduction to get mean and variance + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_mean_thread_buf(I) = type_convert(0.0f); + welford_var_thread_buf(I) = type_convert(0.0f); + welford_count_thread_buf(I) = 0; + }); + + for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration; + ++reducedTiles) + { + threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_mean_global_val_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_mean_thread_buf); + + threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_var_global_val_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_var_thread_buf); + + threadwise_count_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_count_global_val_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_count_thread_buf); + + ThreadwiseWelford::Run(in_welford_mean_thread_buf, + in_welford_var_thread_buf, + in_welford_count_thread_buf, + welford_mean_thread_buf, + welford_var_thread_buf, + welford_count_thread_buf); + + threadwise_mean_var_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k, + mean_var_count_thread_copy_step_m_k); + threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k, + mean_var_count_thread_copy_step_m_k); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseWelford::Run( + welford_mean_thread_buf(I), welford_var_thread_buf(I), welford_count_thread_buf(I)); + }); + + // Step 2: do normalization 
and output y + + const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_y_store = + ThreadwiseTensorSliceTransfer_v1r3( + y_grid_desc_m_k, + make_multi_index( + blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + thread_k_cluster_id * KThreadSliceSize), + y_elementwise_op); + + auto threadwise_scale_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + ScaleSrcVectorSize, + 1, + true>( + scale_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + auto threadwise_bias_load = ThreadwiseTensorSliceTransfer_v2, + 0, + BiasSrcVectorSize, + 1, + true>( + bias_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + const auto x_global_val_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto scale_global_val_buf = make_dynamic_buffer( + p_scale, scale_grid_desc_m.GetElementSpaceSize()); + + const auto bias_global_val_buf = make_dynamic_buffer( + p_bias, bias_grid_desc_m.GetElementSpaceSize()); + + auto y_global_val_buf = make_dynamic_buffer( + p_y, y_grid_desc_m_k.GetElementSpaceSize()); + + threadwise_scale_load.Run(scale_grid_desc_m, + scale_global_val_buf, + thread_buffer_desc_m, + make_tuple(I0), + scale_thread_buf); + + threadwise_bias_load.Run(bias_grid_desc_m, + bias_global_val_buf, + thread_buffer_desc_m, + make_tuple(I0), + bias_thread_buf); + + constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize); + + for(index_t workTiles = 0; workTiles < num_xy_k_block_tile_iteration; ++workTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType multiplier = + scale_thread_buf[iM] / sqrt(welford_var_thread_buf[iM] + epsilon); + + AccDataType fused_mean_bias = + bias_thread_buf[iM] - welford_mean_thread_buf[iM] * multiplier; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + y_thread_buf(Number{}) = + x_thread_buf[Number{}] * multiplier + fused_mean_bias; + }); + }); + + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf, + y_grid_desc_m_k, + y_global_val_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, xy_thread_copy_step_m_k); + } + + // Step 3: update the moving average of mean and variance (optional) + + if(updateMovingAverage && block_local_id == 0 && thread_k_cluster_id == 0) + { + StaticBuffer + running_mean_thread_buf; + StaticBuffer + running_var_thread_buf; + + auto running_mean_global_buf = make_dynamic_buffer( + resultRunningMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto running_var_global_buf = make_dynamic_buffer( + resultRunningVariance, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto threadwise_mean_var_load_m = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcDstVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + 
thread_m_cluster_id * MThreadSliceSize)); + + threadwise_mean_var_load_m.Run(mean_var_grid_desc_m, + running_mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + running_mean_thread_buf); + + threadwise_mean_var_load_m.Run(mean_var_grid_desc_m, + running_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + running_var_thread_buf); + + AccDataType oneMinusAverageFactor = type_convert(1.0) - averageFactor; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + running_mean_thread_buf(I) = running_mean_thread_buf[I] * oneMinusAverageFactor + + welford_mean_thread_buf[I] * averageFactor; + running_var_thread_buf(I) = running_var_thread_buf[I] * oneMinusAverageFactor + + welford_var_thread_buf[I] * averageFactor; + }); + + auto threadwise_mean_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + MeanVarSrcDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_mean_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + running_mean_thread_buf, + mean_var_grid_desc_m, + running_mean_global_buf); + + threadwise_mean_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + running_var_thread_buf, + mean_var_grid_desc_m, + running_var_global_buf); + }; + + // Step 4: save mean and inv-variance (optional) + + if(saveMeanInvVariance && block_local_id == 0 && thread_k_cluster_id == 0) + { + auto result_mean_global_buf = make_dynamic_buffer( + resultSaveMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto result_inv_var_global_buf = make_dynamic_buffer( + resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize()); + + // calculate inv-variance as 1/sqrt(epsilon+variance) + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_var_thread_buf(I) = + type_convert(1.0f) / sqrt(epsilon + welford_var_thread_buf[I]); + }); + + auto threadwise_mean_inv_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + MeanVarSrcDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + welford_mean_thread_buf, + mean_var_grid_desc_m, + result_mean_global_buf); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + welford_var_thread_buf, + mean_var_grid_desc_m, + result_inv_var_global_buf); + }; + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp new file mode 100644 index 00000000..42b7e172 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/batchnorm_multiblock/gridwise_multiblock_welford_second_half_multiblock_reduce_first_half.hpp @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
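+
+// Added overview comment (descriptive only): this header provides the middle stage of the
+// multiblock batchnorm-backward pipeline. It merges the per-block partial Welford results
+// (mean, variance, count) into the final mean and inv_var = 1 / sqrt(variance + epsilon);
+// when haveSavedMeanInvVar is set, the statistics saved by the forward pass can be used
+// instead. It then produces the per-block partial reductions
+//     dbias  = sum(dy)
+//     dscale = sum(dy * (x - mean) * inv_var)
+// The companion GridwiseReduceSecondHalfBatchNormBackwardFinal kernel combines those
+// partials and applies the element-wise result
+//     dx = (scale * inv_var / reduce_size)
+//          * (reduce_size * dy - dbias - dscale * (x - mean) * inv_var)
+// which is the standard chain-rule gradient for y = scale * (x - mean) * inv_var + bias.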
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_welford_second_half_reduce_first_half( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const MeanVarCountGridDesc_M_K mean_var_count_grid_desc_m_k, + const DscaleDbiasGridDesc_M_G dscale_dbias_grid_desc_m_g, + index_t blkgroup_size, + index_t num_xy_k_block_tile_iteration, + index_t num_mean_var_count_k_block_tile_iteration, + AccDataType epsilon, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const MeanVarDataType* const __restrict__ p_in_welford_mean, + const MeanVarDataType* const __restrict__ p_in_welford_variance, + const int32_t* const __restrict__ p_in_welford_count, + const DyElementwiseOp dy_elementwise_op, + MeanVarDataType* const __restrict__ p_out_welford_mean, + MeanVarDataType* const __restrict__ p_out_welford_inv_variance, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + DscaleDbiasDataType* const __restrict__ p_reduce_dscale, + DscaleDbiasDataType* const __restrict__ p_reduce_dbias) +{ + GridwiseWelfordSecondHalfReduceFirstHalf_::Run(x_grid_desc_m_k, + dy_grid_desc_m_k, + mean_var_grid_desc_m, + mean_var_count_grid_desc_m_k, + dscale_dbias_grid_desc_m_g, + blkgroup_size, + num_xy_k_block_tile_iteration, + num_mean_var_count_k_block_tile_iteration, + epsilon, + haveSavedMeanInvVar, + p_savedMean, + p_savedInvVar, + p_in_welford_mean, + p_in_welford_variance, + p_in_welford_count, + dy_elementwise_op, + p_out_welford_mean, + p_out_welford_inv_variance, + p_x, + p_dy, + p_reduce_dscale, + p_reduce_dbias); +}; + +template +struct GridwiseWelfordSecondHalfReduceFirstHalf +{ + static_assert((XDyVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0) || + (XDyVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XDyVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceSrcDesc_M_1 = decltype( + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number<1>{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelfordMerge; + + using BlockwiseWelford = BlockwiseWelford; + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = 
tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // Two of the steps of Multiblock BatchNorm Backward + // Step 1: Second half of Welford method to calculate mean and variance, as well as getting inv-variance = 1/sqrt(epsilon+variance) + // Step 2: First half of Reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // clang-format on + __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k, + const XYGridDesc_M_K& dy_grid_desc_m_k, + const MeanVarGridDesc_M& mean_var_grid_desc_m, + const MeanVarCountGridDesc_M_K& mean_var_count_grid_desc_m_k, + const DscaleDbiasGridDesc_M_G& dscale_dbias_grid_desc_m_g, + index_t blkgroup_size, + index_t num_xy_k_block_tile_iteration, + index_t num_mean_var_count_k_block_tile_iteration, + AccDataType epsilon, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const MeanVarDataType* const __restrict__ p_in_welford_mean, + const MeanVarDataType* const __restrict__ p_in_welford_variance, + const int32_t* const __restrict__ p_in_welford_count, + const DyElementwiseOp dy_elementwise_op, + MeanVarDataType* const __restrict__ p_out_welford_mean, + MeanVarDataType* const __restrict__ p_out_welford_inv_variance, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + DscaleDbiasDataType* const __restrict__ p_reduce_dscale, + DscaleDbiasDataType* const __restrict__ p_reduce_dbias) + { + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_welford_mean_thread_buf; + StaticBuffer + in_welford_var_thread_buf; + StaticBuffer + in_welford_count_thread_buf; + + StaticBuffer + welford_mean_thread_buf; + StaticBuffer + welford_var_thread_buf; + StaticBuffer + welford_count_thread_buf; + + StaticBuffer& mean_thread_buf = + welford_mean_thread_buf; + StaticBuffer& + inv_var_thread_buf = welford_var_thread_buf; + + StaticBuffer + x_thread_buf; + StaticBuffer + dy_thread_buf; + + // buffer of values of dy * (x-mean) * inv-variance, used as input of Blockwise reduction + StaticBuffer + tmp1_thread_buf; + + StaticBuffer + reduce_dscale_thread_buf; + StaticBuffer + reduce_dbias_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / blkgroup_size; + const index_t block_local_id = block_global_id % blkgroup_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + using ThreadBufferLengths_M_1 = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + constexpr auto thread_buffer_desc_m_1 = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number<1>{})); + + // clang-format off + // Step 
1: load existing mean and inv-variance, or do final welford reduction on mean and variance as well as get inv-variance = 1/sqrt(epsilon+variance) + // clang-format on + + if(haveSavedMeanInvVar) + { + const auto mean_global_buf = make_dynamic_buffer( + p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + const auto inv_var_global_buf = make_dynamic_buffer( + p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto threadwise_mean_inv_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + inv_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf); + } + else + { + const auto welford_mean_global_buf = make_dynamic_buffer( + p_in_welford_mean, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + const auto welford_var_global_buf = make_dynamic_buffer( + p_in_welford_variance, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + const auto welford_count_global_buf = make_dynamic_buffer( + p_in_welford_count, mean_var_count_grid_desc_m_k.GetElementSpaceSize()); + + auto threadwise_mean_var_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + mean_var_count_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + auto threadwise_count_load_m_k = + ThreadwiseTensorSliceTransfer_v2, + 1, + 1, + 1, + true>( + mean_var_count_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * 1)); + + constexpr auto mean_var_count_thread_copy_step_m_k = + make_multi_index(0, KThreadClusterSize * 1); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_mean_thread_buf(I) = type_convert(0.0f); + welford_var_thread_buf(I) = type_convert(0.0f); + welford_count_thread_buf(I) = 0; + }); + + for(index_t reducedTiles = 0; reducedTiles < num_mean_var_count_k_block_tile_iteration; + ++reducedTiles) + { + threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_mean_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_mean_thread_buf); + + threadwise_mean_var_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_var_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_var_thread_buf); + + threadwise_count_load_m_k.Run(mean_var_count_grid_desc_m_k, + welford_count_global_buf, + thread_buffer_desc_m_1, + make_tuple(I0, I0), + in_welford_count_thread_buf); + + ThreadwiseWelford::Run(in_welford_mean_thread_buf, + in_welford_var_thread_buf, + in_welford_count_thread_buf, + welford_mean_thread_buf, + welford_var_thread_buf, + welford_count_thread_buf); + + threadwise_mean_var_load_m_k.MoveSrcSliceWindow( + mean_var_count_grid_desc_m_k, mean_var_count_thread_copy_step_m_k); + threadwise_count_load_m_k.MoveSrcSliceWindow(mean_var_count_grid_desc_m_k, + mean_var_count_thread_copy_step_m_k); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseWelford::Run(welford_mean_thread_buf(I), + welford_var_thread_buf(I), + welford_count_thread_buf(I)); + }); + + // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of 
variance + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + welford_var_thread_buf(I) = + type_convert(1.0) / sqrt(welford_var_thread_buf[I] + epsilon); + }); + + if(block_local_id == 0 && thread_k_cluster_id == 0) + { + + auto threadwise_mean_inv_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto mean_global_buf = make_dynamic_buffer( + p_out_welford_mean, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto inv_var_global_buf = make_dynamic_buffer( + p_out_welford_inv_variance, mean_var_grid_desc_m.GetElementSpaceSize()); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf, + mean_var_grid_desc_m, + mean_global_buf); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf, + mean_var_grid_desc_m, + inv_var_global_buf); + }; + }; + + const index_t workSizePerBlock = K_BlockTileSize * num_xy_k_block_tile_iteration; + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2( + dy_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + workSizePerBlock * block_local_id + + thread_k_cluster_id * KThreadSliceSize)); + + const auto x_global_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto dy_global_buf = make_dynamic_buffer( + p_dy, dy_grid_desc_m_k.GetElementSpaceSize()); + + constexpr auto xy_thread_copy_step_m_k = make_multi_index(0, K_BlockTileSize); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + reduce_dscale_thread_buf(I) = type_convert(0); + reduce_dbias_thread_buf(I) = type_convert(0); + }); + + // clang-format off + // Step 2: first-half of reduction: dbias = sum(dy), dscale = sum(dy * (x-mean) * inv-variance) + // clang-format on + + for(index_t reducedTiles = 0; reducedTiles < num_xy_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dy_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + tmp1_thread_buf(Number{}) = norm_x * dy_thread_buf[Number{}]; + }); + }); + + ThreadwiseReduce::Reduce(tmp1_thread_buf, reduce_dscale_thread_buf); + ThreadwiseReduce::Reduce(dy_thread_buf, reduce_dbias_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, xy_thread_copy_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, xy_thread_copy_step_m_k); + }; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseReduce::Reduce(reduce_work_buf, reduce_dscale_thread_buf(I)); + block_sync_lds(); + 
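// Step 2 above accumulates, per channel, dbias = sum(dy) and
// dscale = sum(dy * (x - mean) * inv_variance) before the blockwise reduce.
// A scalar reference of that partial reduction follows, assuming the
// dy_elementwise_op has already been applied to dy; the raw pointers and the
// function name are illustrative, not the kernel's buffer types.
inline void reduce_dscale_dbias(const float* x,
                                const float* dy,
                                float mean,
                                float inv_variance,
                                int reduce_length,
                                float& dscale,
                                float& dbias)
{
    dscale = 0.0f;
    dbias  = 0.0f;

    for(int k = 0; k < reduce_length; ++k)
    {
        const float norm_x = (x[k] - mean) * inv_variance; // normalized input
        dscale += norm_x * dy[k];                          // dscale = sum(dy * (x - mean) * inv_variance)
        dbias += dy[k];                                    // dbias  = sum(dy)
    }
}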
BlockwiseReduce::Reduce(reduce_work_buf, reduce_dbias_thread_buf(I)); + }); + + auto threadwise_dscale_dbias_store = + ThreadwiseTensorSliceTransfer_v1r3, + 1, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + dscale_dbias_grid_desc_m_g, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + block_local_id), + PassThroughOp{}); + + auto reduce_dscale_global_buf = make_dynamic_buffer( + p_reduce_dscale, dscale_dbias_grid_desc_m_g.GetElementSpaceSize()); + + auto reduce_dbias_global_buf = make_dynamic_buffer( + p_reduce_dbias, dscale_dbias_grid_desc_m_g.GetElementSpaceSize()); + + if(thread_k_cluster_id == 0) + { + threadwise_dscale_dbias_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dscale_thread_buf, + dscale_dbias_grid_desc_m_g, + reduce_dscale_global_buf); + + threadwise_dscale_dbias_store.Run(thread_buffer_desc_m_1, + make_tuple(I0, I0), + reduce_dbias_thread_buf, + dscale_dbias_grid_desc_m_g, + reduce_dbias_global_buf); + }; + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp new file mode 100644 index 00000000..460b6843 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/math.hpp" +#include "ck/utility/number.hpp" +#include "ck/tensor_description/tensor_adaptor.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" + +namespace ck { + +// Rows of column-vectors +template +struct BlockToCTileMap_M00_N0_M01 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ __device__ BlockToCTileMap_M00_N0_M01() = default; + + __host__ __device__ BlockToCTileMap_M00_N0_M01(const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01 = 1) + : M01_(M01), underlying_map_(GetBlockToCTileMap(c_grid_desc_m_n, M01)) + { + } + + __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const auto M00 = math::integer_divide_ceil(M0, M01_); + + const index_t grid_size = M00 * M01_ * N0; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + return underlying_map_.CalculateBottomIndex(idx_top); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) const + { + if constexpr(DeviceCTileIndexCheck) + return DefaultValidCTileIndex(c_tile_idx, c_tile_dim); + else + return true; + } + + __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const + { + if constexpr(DeviceCTileIndexCheck) + return true; // validity check moved to kernel + + const index_t M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + if(M0 % M01_ == 0) + { + return true; + } + else + { + return false; + } + } + + private: + __host__ __device__ static constexpr auto + GetBlockToCTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01) + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = 
math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const auto M00 = math::integer_divide_ceil(M0, M01); + + const auto m00_n0_m01_to_m0_n0_block_cluster_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_insert_transform(1), + make_unmerge_transform(make_tuple(M00, M01)), + make_pass_through_transform(make_tuple(N0))), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2>{})); + + const auto cblockid_to_m00_n0_m01_block_cluster_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(1, M00, N0, M01))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto cblockid_to_m0_n0_block_cluster_adaptor = + chain_tensor_adaptors(m00_n0_m01_to_m0_n0_block_cluster_adaptor, + cblockid_to_m00_n0_m01_block_cluster_adaptor); + + return cblockid_to_m0_n0_block_cluster_adaptor; + } + + index_t M01_; + using UnderlyingMap = decltype(GetBlockToCTileMap(CGridDesc_M_N{}, 1)); + UnderlyingMap underlying_map_; +}; + +// Rows of column-vectors +// This C-tile map dynamically adjusts M01 when C-tile index is out of range +template +struct BlockToCTileMap_M00_N0_M01Adapt +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt() = default; + + __host__ __device__ BlockToCTileMap_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01 = 8) + : M01_(M01), c_grid_desc_m_n_(c_grid_desc_m_n) + { + } + + __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const index_t grid_size = M0 * N0; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + auto block_1d_id = idx_top[I0]; + + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock); + + block_1d_id = block_1d_id % (M0 * N0); // swallow batch index + + index_t idx_N0 = block_1d_id % N0; + index_t idx_M0 = block_1d_id / N0; + + const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? 
M01_ : M0 % M01_; + + index_t idx_M00 = idx_M0 / M01_; + index_t idx_M01 = idx_M0 % M01_; + index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; + + return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_, + idx_N0_M01_local / M01_adapt); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */, + const CTileDim& /* c_tile_dim */) const + { + return true; // always valid provided that user gets grid size from CalculateGridSize() + } + + __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; } + + private: + index_t M01_; + CGridDesc_M_N c_grid_desc_m_n_; +}; + +// 2D slices of column-vectors in 3D space +// This C-tile map dynamically adjusts M01 when C-tile index is out of range +template +struct BlockToCTileMap_KSplit_M00_N0_M01Adapt +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt() = default; + + __host__ __device__ BlockToCTileMap_KSplit_M00_N0_M01Adapt(const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01 = 8, + index_t KSplit = 1) + : M01_(M01), KSplit_(KSplit), c_grid_desc_m_n_(c_grid_desc_m_n) + { + } + + __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const index_t grid_size = M0 * N0 * KSplit_; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + auto block_1d_id = idx_top[I0]; + + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n_.GetLength(I1), NPerBlock); + + block_1d_id = block_1d_id % (M0 * N0 * KSplit_); // hide groups + + const index_t idx_ksplit = block_1d_id / (M0 * N0); + block_1d_id = block_1d_id % (M0 * N0); + + index_t idx_N0 = block_1d_id % N0; + index_t idx_M0 = block_1d_id / N0; + + const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? 
M01_ : M0 % M01_; + + index_t idx_M00 = idx_M0 / M01_; + index_t idx_M01 = idx_M0 % M01_; + index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; + + return make_tuple(idx_ksplit, + idx_N0_M01_local % M01_adapt + idx_M00 * M01_, + idx_N0_M01_local / M01_adapt); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& /* c_tile_idx */, + const CTileDim& /* c_tile_dim */) const + { + return true; // always valid provided that user gets grid size from CalculateGridSize() + } + + __host__ bool CheckValidity(const CGridDesc_M_N& /* c_grid_desc_m_n */) const { return true; } + + private: + index_t M01_; + index_t KSplit_; + CGridDesc_M_N c_grid_desc_m_n_; +}; + +// Blocks of row-vectors +template +struct BlockToCTileMap_M00_N00_M01_N01 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ __device__ BlockToCTileMap_M00_N00_M01_N01() = default; + + __host__ __device__ BlockToCTileMap_M00_N00_M01_N01(const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01 = 1, + index_t N01 = 1) + : M01_(M01), N01_(N01), underlying_map_(GetBlockToCTileMap(c_grid_desc_m_n, M01, N01)) + { + } + + __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const auto M00 = math::integer_divide_ceil(M0, M01_); + const auto N00 = math::integer_divide_ceil(N0, N01_); + + const index_t grid_size = M00 * M01_ * N00 * N01_; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + return underlying_map_.CalculateBottomIndex(idx_top); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) const + { + if constexpr(DeviceCTileIndexCheck) + return DefaultValidCTileIndex(c_tile_idx, c_tile_dim); + else + return true; + } + + __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const + { + if constexpr(DeviceCTileIndexCheck) + return true; // validity check moved to kernel + + const index_t M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const index_t N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + if(M0 % M01_ == 0 && N0 % N01_ == 0) + { + return true; + } + else + { + return false; + } + } + + private: + __host__ __device__ static constexpr auto + GetBlockToCTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01) + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const auto M00 = math::integer_divide_ceil(M0, M01); + const auto N00 = math::integer_divide_ceil(N0, N01); + + const auto m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_insert_transform(1), // swallow the carry from lower dimensions + make_unmerge_transform(make_tuple(M00, M01)), + make_unmerge_transform(make_tuple(N00, N01))), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{})); + + const auto cblockid_to_m00_m01_n00_n01_block_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(1, M00, N00, M01, N01))), 
+ make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto cblockid_to_m0_n0_block_cluster_adaptor = + chain_tensor_adaptors(m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor, + cblockid_to_m00_m01_n00_n01_block_cluster_adaptor); + + return cblockid_to_m0_n0_block_cluster_adaptor; + } + + index_t M01_, N01_; + using UnderlyingMap = decltype(GetBlockToCTileMap(CGridDesc_M_N{}, 1, 1)); + UnderlyingMap underlying_map_; +}; + +// 2D slices of row-vectors in 3D space +template +struct BlockToCTileMap_KSplit_M00_N00_M01_N01 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ BlockToCTileMap_KSplit_M00_N00_M01_N01() = default; + + __host__ BlockToCTileMap_KSplit_M00_N00_M01_N01(const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01 = 1, + index_t N01 = 1, + index_t KSplit = 1) + : c_grid_desc_m_n_(c_grid_desc_m_n), + M01_(M01), + N01_(N01), + KSplit_(KSplit), + underlying_map_(GetBlockToCTileMap(c_grid_desc_m_n, M01, N01, KSplit)) + { + } + + __host__ __device__ constexpr index_t + CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const auto M00 = math::integer_divide_ceil(M0, M01_); + const auto N00 = math::integer_divide_ceil(N0, N01_); + + const index_t grid_size = M00 * M01_ * N00 * N01_ * KSplit_; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + static_assert(TopIdx::Size() == 1); + + return underlying_map_.CalculateBottomIndex( + make_multi_index(idx_top[I0] % CalculateGridSize())); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) const + { + if constexpr(DeviceCTileIndexCheck) + return DefaultValidCTileIndex(c_tile_idx, c_tile_dim); + else + return true; + } + + __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const + { + if constexpr(DeviceCTileIndexCheck) + return true; // validity check moved to kernel + + const index_t M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const index_t N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + if(M0 % M01_ == 0 && N0 % N01_ == 0) + { + return true; + } + else + { + return false; + } + } + + private: + __device__ constexpr index_t CalculateGridSize() const + { + return CalculateGridSize(c_grid_desc_m_n_); + } + + __host__ static constexpr auto GetBlockToCTileMap(const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01, + index_t N01, + index_t KSplit) + { + const auto M0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I0), MPerBlock); + const auto N0 = math::integer_divide_ceil(c_grid_desc_m_n.GetLength(I1), NPerBlock); + + const auto M00 = math::integer_divide_ceil(M0, M01); + const auto N00 = math::integer_divide_ceil(N0, N01); + + const auto ksplit_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_pass_through_transform(KSplit), + make_unmerge_transform(make_tuple(M00, M01)), + make_unmerge_transform(make_tuple(N00, N01))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 3>{}, Sequence<2, 4>{})); + + const auto c_blockid_to_ksplit_m00_m01_n00_n01_block_cluster_adaptor = + 
make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(KSplit, M00, N00, M01, N01))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto c_blockid_to_ksplit_m0_n0_block_cluster_adaptor = + chain_tensor_adaptors(ksplit_m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor, + c_blockid_to_ksplit_m00_m01_n00_n01_block_cluster_adaptor); + + return c_blockid_to_ksplit_m0_n0_block_cluster_adaptor; + } + + CGridDesc_M_N c_grid_desc_m_n_; + index_t M01_, N01_, KSplit_; + using UnderlyingMap = decltype(GetBlockToCTileMap(CGridDesc_M_N{}, 1, 1, 1)); + UnderlyingMap underlying_map_; +}; + +template +__host__ __device__ bool DefaultValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) +{ + bool is_valid = false; + + const index_t m_block = c_tile_dim[Number<0>{}]; + const index_t n_block = c_tile_dim[Number<1>{}]; + + if constexpr(CTileIdx::Size() == 2) + { + const index_t m_block_idx = c_tile_idx[Number<0>{}]; + const index_t n_block_idx = c_tile_idx[Number<1>{}]; + if(0 <= m_block_idx && m_block_idx < m_block && 0 <= n_block_idx && n_block_idx < n_block) + { + is_valid = true; + } + } + else if constexpr(CTileIdx::Size() == 3) + { + const index_t ksplit_idx = c_tile_idx[Number<0>{}]; + const index_t m_block_idx = c_tile_idx[Number<1>{}]; + const index_t n_block_idx = c_tile_idx[Number<2>{}]; + if(0 <= m_block_idx && m_block_idx < m_block && 0 <= n_block_idx && n_block_idx < n_block) + { + is_valid = true; + } + ignore = ksplit_idx; + } + + return is_valid; +} + +// This wrapper class is for grouped gemm where it subtracts blockIdx by a value so that the +// workgroups assigned to a given gemm problem have top index offsetted to range [0, +// grid_size_per_gemm] +template +struct OffsettedBlockToCTileMap +{ + using underlying_type = UnderlyingBlockToCTileMap; + + OffsettedBlockToCTileMap(UnderlyingBlockToCTileMap block_to_ctile_map, index_t block_start) + { + block_to_ctile_map_ = block_to_ctile_map; + block_start_ = block_start; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + return block_to_ctile_map_.CalculateBottomIndex( + make_multi_index(idx_top[Number<0>{}] - block_start_)); + } + + template + __host__ __device__ bool ValidCTileIndex(const CTileIdx& c_tile_idx, + const CTileDim& c_tile_dim) const + { + return block_to_ctile_map_.ValidCTileIndex(c_tile_idx, c_tile_dim); + } + + template + __host__ bool CheckValidity(const CGridDesc_M_N& c_grid_desc_m_n) const + { + return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n); + } + + template + __host__ constexpr index_t CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) const + { + return block_to_ctile_map_.CalculateGridSize(c_grid_desc_m_n); + } + + UnderlyingBlockToCTileMap block_to_ctile_map_; + index_t block_start_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp new file mode 100644 index 00000000..bdebe381 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_multiblock.hpp @@ -0,0 +1,321 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
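// The *_M01Adapt C-tile maps above decompose a 1D block id so that consecutive
// blocks walk an M01-tall column of C tiles before advancing in N, with the
// last group shrunk to M0 % M01 so no tile index goes out of range. A
// standalone sketch of that index arithmetic follows; all names are local to
// this sketch.
#include <utility>

inline std::pair<int, int> map_block_to_ctile(int block_1d_id, int M0, int N0, int M01)
{
    block_1d_id %= (M0 * N0); // drop any batch / split-K component, as the maps above do

    const int idx_N0 = block_1d_id % N0;
    const int idx_M0 = block_1d_id / N0;

    // shrink the group height for the trailing rows when M0 is not a multiple of M01
    const int M01_adapt = (idx_M0 < M0 - M0 % M01) ? M01 : M0 % M01;

    const int idx_M00          = idx_M0 / M01;
    const int idx_M01          = idx_M0 % M01;
    const int idx_N0_M01_local = idx_N0 + idx_M01 * N0;

    const int m_tile = idx_N0_M01_local % M01_adapt + idx_M00 * M01;
    const int n_tile = idx_N0_M01_local / M01_adapt;

    return {m_tile, n_tile}; // (m0, n0) tile coordinate of this block
}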
+ +#pragma once + +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +kernel_multiple_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M_Tuple out_grid_desc_m_tuple, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple, + index_t block_group_size, + index_t num_k_block_tile_iteration, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) +{ + GridwiseMultipleReduction::Run(in_grid_desc_m_k, + out_grid_desc_m_tuple, + in_elementwise_op_tuple, + acc_elementwise_op_tuple, + block_group_size, + num_k_block_tile_iteration, + alpha_values, + p_in_value_global, + beta_values, + p_out_value_global_tuple); +}; + +template +struct GridwiseMultipleReduction_mk_to_m_multiblock +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypePointerTuple::Size() && + NumReduction == OutGridDesc_M_Tuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M_Tuple& out_grid_desc_m_tuple, + const InElementwiseOperationTuple& in_elementwise_op_tuple, + const AccElementwiseOperationTuple& acc_elementwise_op_tuple, + index_t block_group_size, + index_t num_k_block_tile_iteration, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) + { + const auto identityVal = 
ReduceOperation::template GetIdentityValue(); + + // LDS, reused by all reductions + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + auto out_global_val_buf_tuple = generate_tuple( + [&](auto iR) { + return make_dynamic_buffer( + p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize()); + }, + Number{}); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + auto in_thread_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + auto accu_value_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}( + [&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; }); + }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR)); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, NumReduction, 1>{}([&](auto iR) { + using OutDataTypePointer = remove_cvref_t; + using OutDataType = remove_cvref_t>; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf_tuple(iR)(I)); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I), + accu_value_buf_tuple(iR)(I)); + + accu_value_buf_tuple(iR)(I) *= alpha_values[iR]; + } + }); + + if(thread_k_cluster_id == 0) + { + if(block_group_size == 0 && 
!float_equal_zero{}(beta_values[iR])) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + 1, + false>( + out_grid_desc_m_tuple[iR], + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR), + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf_tuple(iR)(I) += + type_convert(priorDstValueBuf[I]) * beta_values[iR]; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m_tuple[iR], + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf_tuple[iR], + out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR)); + }; + }); + }; +}; // namespace ck + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp new file mode 100644 index 00000000..1313ec94 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_multiple_reduction_threadwise.hpp @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +kernel_multiple_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M_Tuple out_grid_desc_m_tuple, + const InElementwiseOperationTuple in_elementwise_op_tuple, + const AccElementwiseOperationTuple acc_elementwise_op_tuple, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) +{ + GridwiseMultipleReduction::Run(in_grid_desc_m_k, + out_grid_desc_m_tuple, + in_elementwise_op_tuple, + acc_elementwise_op_tuple, + alpha_values, + p_in_value_global, + beta_values, + p_out_value_global_tuple); +}; + +template +struct GridwiseMultipleReduction_mk_to_m_threadwise +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert(NumReduction == OutDataTypePointerTuple::Size() && + NumReduction == OutGridDesc_M_Tuple::Size() && + NumReduction == OutDstVectorSizeSeq::Size() && + NumReduction == InElementwiseOperationTuple::Size() && + NumReduction == AccElementwiseOperationTuple::Size(), + "All tuple should have the same size as the number of Reductions!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 
1>>::type; + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M_Tuple& out_grid_desc_m_tuple, + const InElementwiseOperationTuple& in_elementwise_op_tuple, + const AccElementwiseOperationTuple& acc_elementwise_op_tuple, + Array alpha_values, + const InDataType* const __restrict__ p_in_value_global, + Array beta_values, + OutDataTypePointerTuple p_out_value_global_tuple) + { + const auto identityVal = ReduceOperation::template GetIdentityValue(); + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + auto out_global_val_buf_tuple = generate_tuple( + [&](auto iR) { + return make_dynamic_buffer( + p_out_value_global_tuple[iR], out_grid_desc_m_tuple[iR].GetElementSpaceSize()); + }, + Number{}); + + StaticBuffer + in_thread_buf; + + auto in_thread_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + auto accu_value_buf_tuple = generate_tuple( + [&](auto iR) { + (void)iR; + return StaticBuffer{}; + }, + Number{}); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}( + [&](auto J) { accu_value_buf_tuple(iR)(J) = identityVal; }); + }); + + const index_t thread_global_1d_id = get_thread_global_1d_id(); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); + + index_t reducedLength = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, NumReduction, 1>{}([&](auto iR) { + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op_tuple[iR](in_thread_buf_tuple(iR)(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf_tuple(iR), accu_value_buf_tuple(iR)); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, NumReduction, 1>{}([&](auto iR) { + using OutDataTypePointer = remove_cvref_t; + using OutDataType = remove_cvref_t>; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + acc_elementwise_op_tuple[iR](accu_value_buf_tuple(iR)(I), + accu_value_buf_tuple(iR)(I)); + + accu_value_buf_tuple(iR)(I) *= alpha_values[iR]; + }); + + if(!float_equal_zero{}(beta_values[iR])) + { + StaticBuffer + priorDstValueBuf; 
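// The store stage being assembled here applies the elementwise post-op, scales
// by alpha, and loads the prior output only when beta is non-zero so it can be
// blended in. A scalar sketch of that epilogue follows; apply_alpha_beta and
// AccOp are illustrative stand-ins for the kernel's functors.
template <typename AccOp>
inline float apply_alpha_beta(float reduced, float prior_out, float alpha, float beta, AccOp acc_op)
{
    float acc = reduced;

    acc_op(acc, acc); // in-place post-op, as in acc_elementwise_op(buf(I), buf(I))
    acc *= alpha;

    if(beta != 0.0f)
        acc += prior_out * beta; // blend with the previously stored output

    return acc;
}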
+ + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + 1, + false>( + out_grid_desc_m_tuple[iR], + make_multi_index(thread_global_1d_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR), + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf_tuple(iR)(I) += + type_convert(priorDstValueBuf[I]) * beta_values[iR]; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSizeSeq::At(iR), + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m_tuple[iR], + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf_tuple[iR], + out_grid_desc_m_tuple[iR], + out_global_val_buf_tuple(iR)); + }); + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp new file mode 100644 index 00000000..6836a660 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_multiblock.hpp @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_multiblock(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) +{ + if constexpr(!OutputIndex) + { + (void)p_in_index_global; + (void)p_out_index_global; + + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + block_group_size, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + beta, + p_out_value_global); + } + else + { + GridwiseReduction::template RunWithIndex(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + p_in_index_global, + beta, + p_out_value_global, + p_out_index_global); + }; +}; + +template +struct GridwiseReduction_mk_to_m_multiblock +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using 
ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + using Accumulation = detail::AccumulateWithNanCheck; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global) + { + const auto identityVal = ReduceOperation::template GetIdentityValue(); + + // LDS + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = 
thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, MThreadSliceSize, 1>{}( + [&](auto I) { BlockwiseReduce::Reduce(reduce_work_buf, accu_value_buf(I)); }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if(block_group_size == 0 && !float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + false>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; + }); + }; + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + true>( + out_grid_desc_m, + make_multi_index(blkgroup_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + } + }; + + template + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) + { + using BlockwiseReduceWithIndex = + PartitionedBlockwiseReductionWithIndex, + ThreadClusterArrangeOrder, + ReduceOperation, + PropagateNan>; + + using AccumulationWithIndex = detail::AccumulateWithIndexAndNanCheck; + + (void)in_elementwise_op; + + // LDS + __shared__ AccDataType p_reduce_work_val_buffer[BlockSize]; + __shared__ IndexDataType p_reduce_work_idx_buffer[BlockSize]; + + const auto identityVal = ReduceOperation::template GetIdentityValue(); + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + const auto in_global_idx_buf = make_dynamic_buffer( + p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_out_index_global, out_grid_desc_m.GetElementSpaceSize()); + + auto reduce_work_val_buf = + make_dynamic_buffer(p_reduce_work_val_buffer, BlockSize); + auto reduce_work_idx_buf = + make_dynamic_buffer(p_reduce_work_idx_buffer, BlockSize); + + StaticBuffer + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; + + 
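// RunWithIndex carries a (value, index) pair through the reduction: a
// candidate's index is adopted only when its value wins under ReduceOperation.
// A minimal sketch of that accumulation step follows; a max-style comparison
// is assumed purely for illustration, and the NaN handling performed by
// AccumulateWithIndexAndNanCheck is omitted.
inline void accumulate_with_index(float& acc_value, int& acc_index, float value, int index)
{
    if(value > acc_value) // candidate wins under a max-style ReduceOperation
    {
        acc_value = value;
        acc_index = index;
    }
}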
StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_1d_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = identityVal; + accu_index_buf(I) = 0; + }); + + constexpr auto in_thread_copy_step = make_multi_index(0, K_BlockTileSize); + + index_t reducedTiles = 0; + + if constexpr(HaveIndexInput) + { + auto threadwise_src_idx_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + threadwise_src_idx_load.Run(in_grid_desc_m_k, + in_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType tmpValue = identityVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); + }); + + BlockwiseReduceWithIndex::Reduce( + reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); + + AccumulationWithIndex::Calculate( + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + } + else + { + index_t indexOffset = 0; + + do + { + // load the thread slice + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + // initialize the indices for the per-thread to-reduce values + in_thread_idx_buf(Number{}) = + indexOffset + thread_k_cluster_id * KThreadSliceSize + iK(); + + // do element-wise pre-reduction operation + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + + AccDataType tmpValue = identityVal; + IndexDataType tmpIndex = 0; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + AccumulationWithIndex::Calculate(tmpValue, + in_thread_val_buf[Number{}], + tmpIndex, + in_thread_idx_buf[Number{}]); + }); + + BlockwiseReduceWithIndex::Reduce( + 
reduce_work_val_buf, reduce_work_idx_buf, tmpValue, tmpIndex); + + AccumulationWithIndex::Calculate( + accu_value_buf(iM), tmpValue, accu_index_buf(iM), tmpIndex); + }); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexOffset += K_BlockTileSize; + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + }; + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if(thread_k_cluster_id == 0) + { + // for indiced operation, acc_elementwise_op shoud do nothing + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + } + }); + + if(thread_k_cluster_id == 0) + { + if(!float_equal_zero{}(beta)) + { + StaticBuffer + priorDstValueBuf; + + auto threadwise_dst_load = + ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + OutDstVectorSize, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValueBuf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValueBuf[I]) * beta; + }); + }; + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + out_grid_desc_m, + make_multi_index(block_global_1d_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_val_store.Run(reduced_data_desc, + make_tuple(I0), + accu_value_buf, + out_grid_desc_m, + out_global_val_buf); + threadwise_dst_idx_store.Run(reduced_data_desc, + make_tuple(I0), + accu_index_buf, + out_grid_desc_m, + out_global_idx_buf); + } + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp new file mode 100644 index 00000000..6c5bd29f --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_2d_reduction_threadwise.hpp @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
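The indexed variant above carries a (value, index) pair through the same tiling: when no index tensor is supplied, indices are synthesized from the running K offset of the tile, and each comparison keeps the index of the winning value. A compact sketch of that pairwise rule for a max-reduction is shown below with placeholder names; the in-tree AccumulateWithIndexAndNanCheck is additionally parameterised on the reduce operation and on PropagateNan, which this sketch does not model.

// Illustrative sketch (not part of this patch): winner-takes-index update for
// an indexed max-reduction; a min-reduction would flip the comparison.
template <typename AccT, typename IdxT>
void accumulate_with_index_max(AccT& acc_val, IdxT& acc_idx, AccT new_val, IdxT new_idx)
{
    if(new_val > acc_val)
    {
        acc_val = new_val;
        acc_idx = new_idx;
    }
}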
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_reduce_threadwise(const InGridDesc_M_K in_grid_desc_m_k, + const OutGridDesc_M out_grid_desc_m, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) +{ + if constexpr(!OutputIndex) + { + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_value_global, + beta, + p_out_value_global); + } + else + { + GridwiseReduction::template RunWithIndex(in_grid_desc_m_k, + out_grid_desc_m, + in_elementwise_op, + acc_elementwise_op, + alpha, + p_in_value_global, + p_in_index_global, + beta, + p_out_value_global, + p_out_index_global); + }; +}; + +template +struct GridwiseReduction_mk_to_m_threadwise +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (MThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + + __device__ static void Run(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global) + { + using ThreadwiseReduce = ThreadwiseReduction; + + const auto identityVal = ReduceOperation::template GetIdentityValue(); + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + auto dst_global_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + + StaticBuffer + in_thread_buf; + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { accu_value_buf(I) = identityVal; }); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * 
MThreadSliceSize, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); + + index_t reducedLength = 0; + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + in_elementwise_op(in_thread_buf(Number{}), + in_thread_buf(Number{})); + }); + }); + + ThreadwiseReduce::Reduce(in_thread_buf, accu_value_buf); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + }); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + true>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + + StaticBuffer + priorDstValue_buf; + + threadwise_dst_load.Run(out_grid_desc_m, + dst_global_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; + }); + }; + + auto threadwise_dst_store = ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + false>( + out_grid_desc_m, + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_store.Run( + reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, dst_global_buf); + }; + + template + __device__ static void RunWithIndex(const InGridDesc_M_K& in_grid_desc_m_k, + const OutGridDesc_M& out_grid_desc_m, + const InElementwiseOperation& in_elementwise_op, + const AccElementwiseOperation& acc_elementwise_op, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + const IndexDataType* const __restrict__ p_in_index_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global, + IndexDataType* const __restrict__ p_out_index_global) + { + using ThreadwiseReduceWithIndex = ThreadwiseReductionWithIndex; + + (void)acc_elementwise_op; + + const auto identityVal = ReduceOperation::template GetIdentityValue(); + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, + in_grid_desc_m_k.GetElementSpaceSize(), + ReduceOperation::template GetIdentityValue()); + const auto in_global_idx_buf = make_dynamic_buffer( + p_in_index_global, in_grid_desc_m_k.GetElementSpaceSize()); + + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m.GetElementSpaceSize()); + auto out_global_idx_buf = make_dynamic_buffer( + p_out_index_global, out_grid_desc_m.GetElementSpaceSize()); + + StaticBuffer + in_thread_val_buf; + + StaticBuffer + in_thread_idx_buf; + + StaticBuffer accu_value_buf; + StaticBuffer accu_index_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = identityVal; + accu_index_buf(I) = 0; + }); + + const auto toReduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = 
make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + index_t thread_global_1d_id = get_block_1d_id() * BlockSize + get_thread_local_1d_id(); + + auto threadwise_src_val_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + + constexpr auto in_thread_copy_step = make_multi_index(0, KThreadSliceSize); + + index_t indexStart = 0; + index_t reducedLength = 0; + if constexpr(HaveIndexInput) + { + auto threadwise_src_idx_load = + ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, make_multi_index(thread_global_1d_id * MThreadSliceSize, 0)); + + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + threadwise_src_idx_load.Run(in_grid_desc_m_k, + in_global_idx_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_idx_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + }); + + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + threadwise_src_idx_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexStart += KThreadSliceSize; + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + } + else + { + do + { + threadwise_src_val_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_val_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // do element-wise pre-reduction operation + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + + in_thread_idx_buf(Number{}) = indexStart + iK(); + + in_elementwise_op(in_thread_val_buf(Number{}), + in_thread_val_buf(Number{})); + }); + }); + + ThreadwiseReduceWithIndex::Reduce( + in_thread_val_buf, in_thread_idx_buf, accu_value_buf, accu_index_buf); + + threadwise_src_val_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_step); + + indexStart += KThreadSliceSize; + reducedLength += KThreadSliceSize; + } while(reducedLength < toReduceLength); + }; + + // for indiced operation, acc_elementwise_op shoud do nothing + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + acc_elementwise_op(accu_value_buf(I), accu_value_buf(I)); + + accu_value_buf(I) *= alpha; + }); + + constexpr auto reduced_data_desc = ThreadReduceDstDesc_M{}; + + if(!float_equal_zero{}(beta)) + { + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2, + Sequence<0>, + 0, + 1, + 1, + false>( + out_grid_desc_m, make_multi_index(thread_global_1d_id * MThreadSliceSize)); + + StaticBuffer + priorDstValue_buf; + + threadwise_dst_load.Run(out_grid_desc_m, + out_global_val_buf, + reduced_data_desc, + make_tuple(I0), + priorDstValue_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) += type_convert(priorDstValue_buf[I]) * beta; + }); + }; + + auto threadwise_dst_val_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + false>( + out_grid_desc_m, + make_multi_index(thread_global_1d_id * MThreadSliceSize), 
+ PassThroughOp{}); + + auto threadwise_dst_idx_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + OutDstVectorSize, + OutMemoryDataOperation, + 1, + false>( + out_grid_desc_m, + make_multi_index(thread_global_1d_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_dst_val_store.Run( + reduced_data_desc, make_tuple(I0), accu_value_buf, out_grid_desc_m, out_global_val_buf); + + threadwise_dst_idx_store.Run( + reduced_data_desc, make_tuple(I0), accu_index_buf, out_grid_desc_m, out_global_idx_buf); + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..fccb127d --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_xdl_cshuffle_v1.hpp @@ -0,0 +1,931 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +struct GridwiseBatchedGemmGemm_Xdl_CShuffle +{ + static_assert(LoopSched == LoopScheduler::Default, + "Non-default loop scheduler is currently not supported"); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + // Gemm0 + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + // Gemm1 + static constexpr auto B1K0 = Number{}; + static constexpr auto B1K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + template + __host__ __device__ static constexpr auto + MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + 
MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t Gemm1NWaves = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B1 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(B1K0, Number{}, B1K1), + make_tuple(Number{} * B1K1, B1K1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + const index_t gemm0_bytes_end = (SharedMemTrait::a_block_space_size_aligned + + SharedMemTrait::b_block_space_size_aligned) * + sizeof(FloatAB); + const index_t gemm1_bytes_end = + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * + sizeof(FloatAB); + const index_t c_block_bytes_end = + SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); + + return math::max(gemm0_bytes_end, gemm1_bytes_end, c_block_bytes_end); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + const auto Gemm1N = b1_grid_desc_bk0_n_bk1.GetLength(I1); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && Gemm1N == c_grid_desc_m_n.GetLength(I1))) + { + return false; + } + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0 && + Gemm1N % Gemm1NPerBlock == 0)) + { + return false; + } + + // check gemm0 gridwise gemm pipeline + const auto num_gemm0_k_loop = K / KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) + { + return false; + } + + // check gemm1 gridwise gemm pipeline + if(!(NPerBlock % Gemm1KPerBlock == 0)) + { + return false; + } + + const auto num_gemm1_k_inner_loop = NPerBlock / Gemm1KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) + { + return false; + } + + 
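CheckValidity rejects problem shapes that do not tile evenly into the chosen block sizes; a host-side pre-check with the same divisibility rules might look like the hypothetical helper below (the real routine additionally validates the gridwise pipelines and, as shown next, the block-to-C-tile map).

#include <cstdint>

// Hypothetical helper (not part of this patch) mirroring the divisibility rules
// above: every problem dimension must tile evenly, and the Gemm0 N tile must be
// consumable by the Gemm1 K tile.
bool problem_is_tileable(std::int64_t M, std::int64_t N, std::int64_t K, std::int64_t Gemm1N,
                         std::int64_t MPerBlock, std::int64_t NPerBlock, std::int64_t KPerBlock,
                         std::int64_t Gemm1NPerBlock, std::int64_t Gemm1KPerBlock)
{
    return M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0 &&
           Gemm1N % Gemm1NPerBlock == 0 && NPerBlock % Gemm1KPerBlock == 0;
}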
if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / Gemm1NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + static constexpr auto a_block_desc_ak0_m_ak1 = + GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + static constexpr auto b_block_desc_bk0_n_bk1 = + GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + static constexpr auto b1_block_desc_bk0_n_bk1 = + GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), B1K1); + + static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( + b1_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b_block_space_offset = a_block_space_size_aligned.value; + static constexpr auto b1_block_space_offset = 0; + + // LDS allocation for C shuffle in LDS + static constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + static constexpr auto c_block_space_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + }; + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const AccElementwiseOperation& acc_element_op, + const B1ElementwiseOperation& b1_element_op, + const CElementwiseOperation& c_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + const 
CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + const auto b1_grid_buf = make_dynamic_buffer( + p_b1_grid, b1_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * Gemm1NPerBlock); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // + // set up Gemm0 + // + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), // will loop over GemmN dimension + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // Fused Gemm+Gemm pipeline + // for n in N0: + // for k in K0: + // acc[m][n] += A[m][k] * B0[k][n] + // acc1[m][o] += acc[m][n] * B1[n][o] + + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_v2< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + decltype(MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(a_block_desc_ak0_m_ak1)), + 
decltype(MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(b_block_desc_bk0_n_bk1)), + MPerBlock, + NPerBlock, + KPerBlock, + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + true>{}; // TransposeC + + auto acc_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a_block_space_offset, + a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + const auto a_block_reset_copy_step = + make_multi_index(-a_grid_desc_ak0_m_ak1.GetLength(I0), 0, 0); + const auto b_block_reset_copy_step = + make_multi_index(-b_grid_desc_bk0_n_bk1.GetLength(I0), NPerBlock, 0); + + // gridwise GEMM pipeline + // Only supports LoopScheduler::Default + const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + // + // set up Gemm1 + // + + // Acc matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type + constexpr auto acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + constexpr auto m0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); + constexpr auto n0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); + constexpr auto m1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); + constexpr auto n1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); + constexpr auto m2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); + constexpr auto n2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); + constexpr auto n3 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); + constexpr auto n4 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + + constexpr auto b1_block_slice_copy_step = make_multi_index(Gemm1KPerBlock / B1K1, 0, 0); + + // acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 to acc_thread_desc_k0_m_k1 + // n0_n1_n2_n3 -> k0 + // m0_m1_m2 -> m + // n4 -> k1 + // NOTE: had to use merge_v3 or will spit out compilation errors + constexpr auto acc_thread_desc_k0_m_k1 = transform_tensor_descriptor( + acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(n0, n1, n2, n3)), + make_merge_transform_v3_division_mod(make_tuple(m0, m1, m2)), + make_pass_through_transform(n4)), + make_tuple(Sequence<1, 3, 5, 6>{}, Sequence<0, 2, 4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // A1 matrix in AccVGPR + // N2 num_groups_per_blk, N3 num_input_blks, N4 group_size + constexpr auto AccN3 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLength(I6); + + constexpr auto A1ThreadSlice_K0_M_K1 = + make_tuple(Number{}, Number{}, Number{}); + + constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0]; + constexpr auto A1ThreadSliceM = A1ThreadSlice_K0_M_K1[I1]; + constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2]; + constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor( + A1ThreadSlice_K0_M_K1, + make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1)); + + // B1 matrix 
in LDS memory, dst of blockwise copy + constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A1 matrix blockwise copy + auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< + FloatGemmAcc, + FloatAB, + decltype(acc_thread_desc_k0_m_k1), + decltype(a1_thread_desc_k0_m_k1), + decltype(acc_element_op), + Sequence, + Sequence<1, 0, 2>, + 2, + n4>{acc_element_op}; + + // B1 matrix blockwise copy + auto b1_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b1_grid_desc_bk0_n_bk1), + decltype(b1_block_desc_bk0_n_bk1), + B1BlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + B1BlockTransferSrcVectorDim, + 2, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + 1, + 1, + B1ThreadTransferSrcResetCoordinateAfterRun, + true, // DstResetCoord + NumGemmKPrefetchStage>( + b1_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b1_element_op, + b1_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + auto a1_thread_buf = make_static_buffer( + a1_thread_desc_k0_m_k1.GetElementSpaceSize()); + + // reuse LDS space for gemm0's b_block_buf + auto b1_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b1_block_space_offset, + b1_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + // selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size + // selected_mfma.k_per_blk <= Gemm1KPack + // + // Following similar rationale behind Gemm0KPack, let Gemm1KPack be the lowest common + // multiples of A1K1 (predetermined by selected_mfma.group_size) and B1K1. But in this case + // Gemm1KPack can't be higher than A1K1 itself because A1 matrix is distributed in VGPRs + // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will + // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7]. 
+ // therefore we may just as well assign Gemm1KPack = group_size + constexpr index_t Gemm1KPack = + MfmaSelector::selected_mfma.group_size; + + auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a1_thread_desc_k0_m_k1), + decltype(b1_block_desc_bk0_n_bk1), + decltype(MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(a1_thread_desc_k0_m_k1)), + decltype(MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(b1_block_desc_bk0_n_bk1)), + MPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + MPerXdl, + NPerXdl, + MXdlPerWave, + Gemm1NXdlPerWave, + Gemm1KPack, + false, // TransposeC + Gemm1KPack, // AMmaKStride + Gemm1KPack * XdlopsGemm{}.K0PerXdlops>{ + // BMmaKStride + make_tuple(0, 0, 0, 0)}; // A_origin + + auto c_thread_buf = gemm1_blockwise_gemm.GetCThreadBuffer(); + + const index_t num_gemm1_k_block_outer_loop = + b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock; + constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock; + + // Initialize C + c_thread_buf.Clear(); + + // gemm1 K loop + index_t gemm1_k_block_outer_index = 0; + do + { + // gemm0 + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + acc_thread_buf, + num_k_block_main_loop); + // gemm1 + { + // TODO: explore using dynamic buffer for a1 thread buffer + // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(), + // RunWrite(), and MoveSliceWindow(). But it is impossible to implement given that + // the A1 source buffer is static buffer holding the output of first GEMM and + // requires constexpr offset by design. Therefore, we pass tensor coordinate offset + // explicitly in Run() below. 
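The comment block above explains why the Gemm0 accumulator feeds Gemm1 straight from registers with explicit constexpr offsets. Functionally, the whole pipeline realises the pseudocode given earlier: C = (A * B0) * B1, with the elementwise acc operation applied to the Gemm0 output before the second GEMM. A naive, untiled reference of that composition follows; names are illustrative and acc_op stands in for acc_element_op.

#include <cstddef>
#include <functional>
#include <vector>

// Illustrative reference (not part of this patch) for the fused Gemm+Gemm:
//   acc[m][n] = sum_k A[m][k] * B0[k][n]            (Gemm0)
//   C[m][o]  += acc_op(acc[m][n]) * B1[n][o]        (Gemm1)
// Row-major layout, no tiling or pipelining.
std::vector<float> fused_gemm_gemm_ref(const std::vector<float>& A,  // M x K
                                       const std::vector<float>& B0, // K x N
                                       const std::vector<float>& B1, // N x O
                                       int M, int K, int N, int O,
                                       const std::function<float(float)>& acc_op)
{
    std::vector<float> C(static_cast<std::size_t>(M) * O, 0.0f);
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.0f;
            for(int k = 0; k < K; ++k)
                acc += A[m * K + k] * B0[k * N + n]; // Gemm0
            const float a1 = acc_op(acc);            // elementwise op between the GEMMs
            for(int o = 0; o < O; ++o)
                C[m * O + o] += a1 * B1[n * O + o];  // Gemm1 consumes the Gemm0 row
        }
    return C;
}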
+ + // preload data into LDS + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + block_sync_lds(); // wait for gemm0 LDS read + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + + // main body + if constexpr(num_gemm1_k_block_inner_loop > 1) + { + static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) { + a1_blockwise_copy.Run(acc_thread_desc_k0_m_k1, + make_tuple(Number{}, I0, I0), + acc_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + block_sync_lds(); + + gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, c_thread_buf); + + block_sync_lds(); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + }); + } + // tail + { + a1_blockwise_copy.Run( + acc_thread_desc_k0_m_k1, + make_tuple( + Number<(num_gemm1_k_block_inner_loop - 1) * A1ThreadSliceK0>{}, I0, I0), + acc_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + block_sync_lds(); + + gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, c_thread_buf); + } + } // end gemm1 + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_ak0_m_ak1, + a_block_reset_copy_step); // rewind K + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_bk0_n_bk1, + b_block_reset_copy_step); // rewind K and step N + + block_sync_lds(); // wait for gemm1 LDS read + } while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); // end j loop + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + Gemm1NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + gemm1_blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + gemm1_blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto 
c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..b9f4a308 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle_v1.hpp @@ -0,0 +1,1268 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
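The C write-out that closes GridwiseBatchedGemmGemm_Xdl_CShuffle above alternates between a per-thread VGPR-to-LDS copy and a blockwise LDS-to-global copy, separated by LDS barriers and stepped along a space-filling curve over the output tile. A skeletal sketch of that access pattern follows, with callbacks standing in for the transfer objects of the real code; all names are illustrative.

#include <functional>

// Illustrative skeleton (not part of this patch) of the shuffle write-out loop:
// barrier / copy_vgpr_to_lds / copy_lds_to_global / move_dst_window stand in for
// block_sync_lds(), the threadwise VGPR->LDS copy, the blockwise LDS->global
// copy, and the destination-window move along the space-filling curve.
void shuffle_write_out(int num_access,
                       const std::function<void()>& barrier,
                       const std::function<void(int)>& copy_vgpr_to_lds,
                       const std::function<void(int)>& copy_lds_to_global,
                       const std::function<void(int)>& move_dst_window)
{
    for(int access = 0; access < num_access; ++access)
    {
        barrier();                 // LDS is safe to overwrite
        copy_vgpr_to_lds(access);  // each thread writes its accumulator slice
        barrier();                 // the whole shuffled tile is now in LDS
        copy_lds_to_global(access);
        if(access < num_access - 1)
            move_dst_window(access); // step to the next tile position
    }
}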
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +struct GridwiseBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle +{ + static_assert(LoopSched == LoopScheduler::Default, + "Non-default loop scheduler is currently not supported"); + + static constexpr index_t NumD0Tensor = D0sDataType::Size(); + static constexpr index_t NumD1Tensor = D1sDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + static constexpr auto WaveSize = 64; + // K1 should be Number<...> + // Gemm0 + static constexpr auto A0K1 = Number{}; + static constexpr auto B0K1 = Number{}; + + static constexpr auto A0K0PerBlock = Number{}; + static constexpr auto B0K0PerBlock = Number{}; + + static constexpr auto Gemm0MWaves = Gemm0MPerBlock / (Gemm0MPerXdl * Gemm0MXdlPerWave); + static constexpr auto Gemm0NWaves = Gemm0NPerBlock / (Gemm0NPerXdl * Gemm0NXdlPerWave); + // Gemm1 + static constexpr auto B1K1 = Number{}; + static constexpr auto B1K0PerBlock = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = GridwiseGemmPipeline_v1; + + // ck::Tuple + static constexpr auto MakeD0sGridPointer() + { + return generate_tuple( + [&](auto i) { + using D0DataType = remove_cvref_t>; + + return static_cast(nullptr); + }, + Number{}); + } + + // ck::Tuple + static constexpr auto MakeD1sGridPointer() + { + return generate_tuple( + [&](auto i) { + using D1DataType = remove_cvref_t>; + + return static_cast(nullptr); + }, + Number{}); + } + + __device__ static auto GetGemm0WaveIdx() + { + const index_t thread_id = get_thread_local_1d_id(); + + constexpr auto threadid_to_wave_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(Gemm0MWaves, Gemm0NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto GetGemm0WaveMNIdx(const index_t thread_id) + { + constexpr auto wave_threadid_to_mn_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(WaveSize / Gemm0NPerXdl, Gemm0NPerXdl))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return wave_threadid_to_mn_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + template + __host__ __device__ static constexpr auto + MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const A0BlockDesc_AK0_M_AK1&) + { + constexpr index_t MWaves = Gemm0MPerBlock / (Gemm0MXdlPerWave * Gemm0MPerXdl); + + return 
MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + A0BlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t NWaves = Gemm0NPerBlock / (Gemm0NXdlPerWave * Gemm0NPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const A0BlockDesc_AK0_M_AK1&) + { + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + A0BlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t Gemm1NWaves = Gemm1NPerBlock / (Gemm1NXdlPerWave * Gemm0NPerXdl); + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + __host__ __device__ static constexpr auto GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A0 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(A0K0PerBlock, Number{}, A0K1), + make_tuple(Number{} * A0K1, A0K1, I1)); + } + + __host__ __device__ static constexpr auto GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B0 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(B0K0PerBlock, Number{}, B0K1), + make_tuple(Number{} * B0K1, B0K1, I1)); + } + + __host__ __device__ static constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B1 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(B1K0PerBlock, Number{}, B1K1), + make_tuple(Number{} * B1K1, B1K1, I1)); + } + + __host__ __device__ static constexpr auto + GetC1ShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = Gemm0MPerBlock / (Gemm0MXdlPerWave * Gemm0MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * Gemm0NPerXdl); + + constexpr auto c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + const index_t gemm0_bytes_end = (SharedMemTrait::a0_block_space_size_aligned + + SharedMemTrait::b0_block_space_size_aligned) * + sizeof(A0B0B1DataType); + const index_t gemm1_bytes_end = + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * + sizeof(A0B0B1DataType); + const index_t c1_block_bytes_end = + SharedMemTrait::c1_block_space_size * sizeof(C1ShuffleDataType); + + return math::max(gemm0_bytes_end, gemm1_bytes_end, c1_block_bytes_end); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const A0GridDesc_M_K& a0_grid_desc_m_k, + const B0GridDesc_N_K& b0_grid_desc_n_k, + const B1GridDesc_N_K& b1_grid_desc_n_k, + const E1GridDesc_M_N& e1_grid_desc_m_n, + const Block2E1TileMap& block_2_e1tile_map) + { + static_assert((Gemm0MPerBlock % (Gemm0MPerXdl * Gemm0MXdlPerWave) == 0) && + (Gemm0NPerBlock % (Gemm0NXdlPerWave * Gemm0NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a0_grid_desc_m_k.GetLength(I0); + const auto N = b0_grid_desc_n_k.GetLength(I0); + const auto K = a0_grid_desc_m_k.GetLength(I1); + const auto Gemm1N = b1_grid_desc_n_k.GetLength(I0); + + if(!(M 
== e1_grid_desc_m_n.GetLength(I0) && Gemm1N == e1_grid_desc_m_n.GetLength(I1))) + { + return false; + } + + if(!(M % Gemm0MPerBlock == 0 && N % Gemm0NPerBlock == 0 && K % Gemm0KPerBlock == 0 && + Gemm1N % Gemm1NPerBlock == 0)) + { + return false; + } + + // check gemm0 gridwise gemm pipeline + const auto num_gemm0_k_loop = K / Gemm0KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) + { + return false; + } + + // check gemm1 gridwise gemm pipeline + if(!(Gemm0NPerBlock % Gemm1KPerBlock == 0)) + { + return false; + } + + const auto num_gemm1_k_inner_loop = Gemm0NPerBlock / Gemm1KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) + { + return false; + } + + if(!block_2_e1tile_map.CheckValidity(e1_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / Gemm0KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + // A0 desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultA0GridDescriptor_AK0_M_AK1(const A0GridDesc_M_K& a0_grid_desc_m_k) + { + const auto M = a0_grid_desc_m_k.GetLength(I0); + const auto K = a0_grid_desc_m_k.GetLength(I1); + + const auto A0K0 = K / A0K1; + + return transform_tensor_descriptor( + a0_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(A0K0, A0K1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // B0 desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultB0GridDescriptor_BK0_N_BK1(const B0GridDesc_N_K& b0_grid_desc_n_k) + { + const auto N = b0_grid_desc_n_k.GetLength(I0); + const auto K = b0_grid_desc_n_k.GetLength(I1); + + const auto B0K0 = K / B0K1; + + return transform_tensor_descriptor( + b0_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B0K0, B0K1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // D0 desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeGemm0D0GridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(const D0GridDesc_M_N& d0_grid_desc_m_n) + { + const auto M = d0_grid_desc_m_n.GetLength(I0); + const auto N = d0_grid_desc_m_n.GetLength(I1); + + constexpr auto mfma = + MfmaSelector::selected_mfma; + constexpr auto N3 = mfma.num_groups_per_blk; + constexpr auto N5 = mfma.group_size; + return transform_tensor_descriptor( + d0_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple( + M / Gemm0MPerBlock, Gemm0MXdlPerWave, Gemm0MWaves, Gemm0MPerXdl)), + make_unmerge_transform(make_tuple(N / Gemm0NPerBlock, + Gemm0NXdlPerWave, + Gemm0NWaves, + N3, + WaveSize / Gemm0NPerXdl, + N5))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7, 8, 9>{})); + } + + // B1 desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K& b1_grid_desc_n_k) + { + const auto N = b1_grid_desc_n_k.GetLength(I0); + const auto K = b1_grid_desc_n_k.GetLength(I1); + + const auto B1K0 = K / B1K1; + + return transform_tensor_descriptor( + b1_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, 
Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // C1 desc for destination in blockwise copy + __host__ __device__ static constexpr auto + MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const E1GridDesc_M_N& e1_grid_desc_m_n) + { + const auto M = e1_grid_desc_m_n.GetLength(I0); + const auto N = e1_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / Gemm0MPerBlock; + const auto NBlock = N / Gemm1NPerBlock; + + const auto e1_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + e1_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return e1_grid_desc_mblock_mperblock_nblock_nperblock; + } + // D0s desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeD0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(const D0sGridDesc_M_N& ds_grid_desc_m_n) + { + return generate_tuple( + [&](auto i) { + return MakeGemm0D0GridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5(ds_grid_desc_m_n[i]); + }, + Number{}); + } + // Ds desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeD1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + const DsGridDescriptor_M_N& ds_grid_desc_m_n) + { + return generate_tuple( + [&](auto i) { + return MakeE1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]); + }, + Number{}); + } + + // return block_id to C1 matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2E1TileMap(const E1GridDesc_M_N& e1_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + e1_grid_desc_m_n); + } + + using E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5 = remove_cvref_t; + + using D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using DefaultBlock2E1TileMap = + remove_cvref_t; + + struct SharedMemTrait + { + // LDS allocation for A0 and B0: be careful of alignment + static constexpr auto a0_block_desc_ak0_m_ak1 = + GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + static constexpr auto b0_block_desc_bk0_n_bk1 = + GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + static constexpr auto b1_block_desc_bk0_n_bk1 = + GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + static constexpr auto max_lds_align = math::lcm(math::lcm(A0K1, B0K1), B1K1); + + static constexpr auto a0_block_space_size_aligned = math::integer_least_multiple( + a0_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b0_block_space_size_aligned = math::integer_least_multiple( + b0_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( + b1_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + static constexpr auto a0_block_space_offset = 0; + static constexpr auto b0_block_space_offset = a0_block_space_size_aligned.value; + static constexpr auto b1_block_space_offset = 0; + + // LDS allocation for C1 shuffle in LDS + static constexpr auto c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetC1ShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + static constexpr auto c1_block_space_size = + c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + }; + + using D0sGridPointer = 
decltype(MakeD0sGridPointer()); + using D1sGridPointer = decltype(MakeD1sGridPointer()); + + template + __device__ static void Run(const A0B0B1DataType* __restrict__ p_a0_grid, + const A0B0B1DataType* __restrict__ p_b0_grid, + D0sGridPointer p_d0s_grid, + const A0B0B1DataType* __restrict__ p_b1_grid, + D1sGridPointer p_d1s_grid, + E1DataType* __restrict__ p_e1_grid, + void* __restrict__ p_shared, + const A0ElementwiseOperation& a0_element_op, + const B0ElementwiseOperation& b0_element_op, + const CDE0ElementwiseOperation& cde0_element_op, + const B1ElementwiseOperation& b1_element_op, + const CDE1ElementwiseOperation& cde1_element_op, + const A0GridDesc_AK0_M_AK1& a0_grid_desc_ak0_m_ak1, + const B0GridDesc_BK0_N_BK1& b0_grid_desc_bk0_n_bk1, + const D0sGridDescriptor_M0_N0_M1_N1_M2_N2_M3_N3_N4_N5& + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5, + const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + const D1sGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + d1s_grid_desc_mblock_mperblock_nblock_nperblock, + const E1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + e1_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2E1TileMap& block_2_e1tile_map) + { + const auto a0_grid_buf = make_dynamic_buffer( + p_a0_grid, a0_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b0_grid_buf = make_dynamic_buffer( + p_b0_grid, b0_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + const auto b1_grid_buf = make_dynamic_buffer( + p_b1_grid, b1_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto e1_grid_buf = make_dynamic_buffer( + p_e1_grid, e1_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + const auto d0s_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_d0s_grid[i], + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i].GetElementSpaceSize()); + }, + Number{}); + const auto d1s_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_d1s_grid[i], + d1s_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize()); + }, + Number{}); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_e1tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_e1tile_map.ValidCTileIndex( + block_work_idx, + make_tuple(e1_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + e1_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * Gemm0MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * Gemm1NPerBlock); + + // A0 matrix in LDS memory, dst of blockwise copy + constexpr auto a0_block_desc_ak0_m_ak1 = GetA0BlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B0 matrix in LDS memory, dst of blockwise copy + constexpr auto b0_block_desc_bk0_n_bk1 = GetB0BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // + // set up Gemm0 + // + + // A0 matrix blockwise copy + auto a0_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + A0BlockTransferThreadClusterLengths_AK0_M_AK1, + A0BlockTransferThreadClusterArrangeOrder, + A0B0B1DataType, + A0B0B1DataType, + decltype(a0_grid_desc_ak0_m_ak1), + decltype(a0_block_desc_ak0_m_ak1), + A0BlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + A0BlockTransferSrcVectorDim, + 2, + A0BlockTransferSrcScalarPerVector, + A0BlockTransferDstScalarPerVector_AK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + 
NumGemm0KPrefetchStage>( + a0_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a0_element_op, + a0_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // B0 matrix blockwise copy + auto b0_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + B0BlockTransferThreadClusterLengths_BK0_N_BK1, + B0BlockTransferThreadClusterArrangeOrder, + A0B0B1DataType, + A0B0B1DataType, + decltype(b0_grid_desc_bk0_n_bk1), + decltype(b0_block_desc_bk0_n_bk1), + B0BlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + B0BlockTransferSrcVectorDim, + 2, + B0BlockTransferSrcScalarPerVector, + B0BlockTransferDstScalarPerVector_BK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemm0KPrefetchStage>( + b0_grid_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), // will loop over GemmN dimension + b0_element_op, + b0_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // Fused Gemm+Gemm pipeline + // for n in N0: + // for k in K0: + // acc[m][n] += A[m][k] * B0[k][n] + // acc1[m][o] += acc[m][n] * B1[n][o] + + // sanity check + constexpr index_t KPack = math::max( + math::lcm(A0K1, B0K1), + MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm0 = BlockwiseGemmXdlops_v2< + BlockSize, + A0B0B1DataType, + Acc0DataType, + decltype(a0_block_desc_ak0_m_ak1), + decltype(b0_block_desc_bk0_n_bk1), + decltype(MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(a0_block_desc_ak0_m_ak1)), + decltype(MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(b0_block_desc_bk0_n_bk1)), + Gemm0MPerBlock, + Gemm0NPerBlock, + Gemm0KPerBlock, + Gemm0MPerXdl, + Gemm0NPerXdl, + Gemm0MXdlPerWave, + Gemm0NXdlPerWave, + KPack, + true>{}; // TransposeC + + auto acc0_thread_buf = blockwise_gemm0.GetCThreadBuffer(); + + // LDS allocation for A0 and B0: be careful of alignment + auto a0_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a0_block_space_offset, + a0_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b0_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b0_block_space_offset, + b0_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a0_block_slice_copy_step = make_multi_index(Gemm0KPerBlock / A0K1, 0, 0); + constexpr auto b0_block_slice_copy_step = make_multi_index(Gemm0KPerBlock / B0K1, 0, 0); + const auto a0_block_reset_copy_step = + make_multi_index(-a0_grid_desc_ak0_m_ak1.GetLength(I0), 0, 0); + const auto b0_block_reset_copy_step = + make_multi_index(-b0_grid_desc_bk0_n_bk1.GetLength(I0), Gemm0NPerBlock, 0); + + // gridwise GEMM pipeline + // Only supports LoopScheduler::Default + const auto gridwise_gemm0_pipeline = + GridwiseGemmPipeline_v1_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a0_grid_desc_ak0_m_ak1.GetLength(I0) * a0_grid_desc_ak0_m_ak1.GetLength(I2)) / + Gemm0KPerBlock); + + // + // set up Gemm1 + // + + // Acc0 matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type + constexpr auto acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + blockwise_gemm0.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + constexpr auto m0 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); + constexpr auto n0 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); + constexpr auto m1 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); + constexpr auto n1 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); + constexpr auto m2 = 
acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); + constexpr auto n2 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); + constexpr auto n3 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); + constexpr auto n4 = acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + + constexpr auto b1_block_slice_copy_step = make_multi_index(Gemm1KPerBlock / B1K1, 0, 0); + + // d0 matrix threadwise copy + constexpr auto d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5 = + make_naive_tensor_descriptor_packed(make_tuple(I1, // MBlockId + I1, // NBlockID + I1, // MRepeat + I1, // NRepeat + I1, // MWaveId + I1, // NWaveId + I1, // MPerXdl + I1, // NGroupNum + I1, // NInputNum + n4)); // registerNum + + auto d0s_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer< + AddressSpaceEnum::Vgpr, + A0B0B1DataType, + d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5.GetElementSpaceSize(), + true>{}; + }, + Number{}); + + const auto wave_id = GetGemm0WaveIdx(); + const auto wave_m_n_id = GetGemm0WaveMNIdx(wave_id[I2]); // I2: 0~63 + + constexpr auto acc0_thread_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}, n2, n4)); + + auto d0s_threadwise_copy = generate_tuple( + [&](auto i) { + return ThreadwiseTensorSliceTransfer_v2< + A0B0B1DataType, + A0B0B1DataType, + decltype(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i]), + decltype(d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5), + Sequence, + Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>, + 9, + n4, + 1, + false>(d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i], + make_multi_index(block_work_idx[I0], // MBlockId + 0, // NBlockId + 0, // mrepeat + 0, // nrepeat + wave_id[I0], // MWaveId + wave_id[I1], // NWaveId + wave_m_n_id[I1], // MPerXdl + 0, // group + wave_m_n_id[I0], // NInputIndex + 0)); // register number + }, + Number{}); + // acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 to acc0_thread_desc_k0_m_k1 + // n0_n1_n2_n3 -> k0 + // m0_m1_m2 -> m + // n4 -> k1 + // NOTE: had to use merge_v3 or will spit out compilation errors + constexpr auto acc0_thread_desc_k0_m_k1 = transform_tensor_descriptor( + acc0_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(n0, n1, n2, n3)), + make_merge_transform_v3_division_mod(make_tuple(m0, m1, m2)), + make_pass_through_transform(n4)), + make_tuple(Sequence<1, 3, 5, 6>{}, Sequence<0, 2, 4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // A1 matrix in AccVGPR + // N2 num_groups_per_blk, N3 num_input_blks, N4 group_size + constexpr auto Acc0N3 = + blockwise_gemm0.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLength(I6); + + constexpr auto A1ThreadSlice_K0_M_K1 = make_tuple( + Number{}, Number{}, Number{}); + + constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0]; + constexpr auto A1ThreadSliceM = A1ThreadSlice_K0_M_K1[I1]; + constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2]; + constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor( + A1ThreadSlice_K0_M_K1, + make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1)); + + // B1 matrix in LDS memory, dst of blockwise copy + constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A1 matrix blockwise copy + auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< + Acc0DataType, + A0B0B1DataType, + decltype(acc0_thread_desc_k0_m_k1), + decltype(a1_thread_desc_k0_m_k1), + tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<1, 0, 2>, + 2, + 
n4>{tensor_operation::element_wise::PassThrough{}}; + + // B1 matrix blockwise copy + auto b1_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + A0B0B1DataType, + A0B0B1DataType, + decltype(b1_grid_desc_bk0_n_bk1), + decltype(b1_block_desc_bk0_n_bk1), + B1BlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + B1BlockTransferSrcVectorDim, + 2, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + 1, + 1, + B1ThreadTransferSrcResetCoordinateAfterRun, + true, // DstResetCoord + 1>(b1_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b1_element_op, + b1_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + auto a1_thread_buf = make_static_buffer( + a1_thread_desc_k0_m_k1.GetElementSpaceSize()); + + // reuse LDS space for gemm0's b0_block_buf + auto b1_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b1_block_space_offset, + b1_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr index_t Gemm1KPack = math::max( + math::lcm( + MfmaSelector::selected_mfma.group_size, + B1K1), + MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm1 = BlockwiseGemmXdlops_v2< + BlockSize, + A0B0B1DataType, + Acc1DataType, + decltype(a1_thread_desc_k0_m_k1), + decltype(b1_block_desc_bk0_n_bk1), + decltype(MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(a1_thread_desc_k0_m_k1)), + decltype(MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(b1_block_desc_bk0_n_bk1)), + Gemm0MPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + Gemm0MPerXdl, + Gemm0NPerXdl, + Gemm0MXdlPerWave, + Gemm1NXdlPerWave, + Gemm1KPack, + false, // TransposeC + Gemm1KPack, // AMmaKStride + Gemm1KPack * XdlopsGemm{} + .K0PerXdlops>{ // BMmaKStride + make_tuple(0, 0, 0, 0)}; // A_origin + + auto c1_thread_buf = blockwise_gemm1.GetCThreadBuffer(); + + const index_t num_gemm1_k_block_outer_loop = + b0_grid_desc_bk0_n_bk1.GetLength(I1) / Gemm0NPerBlock; + constexpr index_t num_gemm1_k_block_inner_loop = Gemm0NPerBlock / Gemm1KPerBlock; + + // Initialize C1 + c1_thread_buf.Clear(); + + // gemm1 K loop + index_t gemm1_k_block_outer_index = 0; + do + { + // gemm0 + gridwise_gemm0_pipeline.template Run(a0_grid_desc_ak0_m_ak1, + a0_block_desc_ak0_m_ak1, + a0_blockwise_copy, + a0_grid_buf, + a0_block_buf, + a0_block_slice_copy_step, + b0_grid_desc_bk0_n_bk1, + b0_block_desc_bk0_n_bk1, + b0_blockwise_copy, + b0_grid_buf, + b0_block_buf, + b0_block_slice_copy_step, + blockwise_gemm0, + acc0_thread_buf, + num_k_block_main_loop); + // bias+gelu + { + static_for<0, Gemm0MXdlPerWave, 1>{}([&](auto mr) { + static_for<0, Gemm0NXdlPerWave, 1>{}([&](auto nr) { + static_for<0, n2, 1>{}([&](auto groupid) { + static_for<0, NumD0Tensor, 1>{}([&](auto i) { + d0s_threadwise_copy(i).Run( + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i], + d0s_grid_buf[i], + d0_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + d0s_thread_buf(i)); + }); + + static_for<0, n4, 1>{}([&](auto i) { + constexpr index_t c_offset = acc0_thread_desc.CalculateOffset( + make_tuple(mr, nr, groupid, i)); + + // get reference to src data + const auto src_data_refs = generate_tie( + // return type should be lvalue + [&](auto iSrc) -> const auto& { + return d0s_thread_buf[iSrc][i]; + }, + Number{}); + + // get reference to dst data + auto dst_data_refs = generate_tie( + // return type should be lvalue + [&](auto) -> auto& { + return 
acc0_thread_buf(Number{}); + }, + Number<2>{}); + + unpack2(cde0_element_op, dst_data_refs, src_data_refs); + }); + static_for<0, NumD0Tensor, 1>{}([&](auto i) { + d0s_threadwise_copy(i).MoveSrcSliceWindow( + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i], + make_multi_index(0, 0, 0, 0, 0, 0, 0, 1, 0, 0)); + }); + }); + static_for<0, NumD0Tensor, 1>{}([&](auto i) { + d0s_threadwise_copy(i).MoveSrcSliceWindow( + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i], + make_multi_index(0, 0, 0, 1, 0, 0, 0, -n2.value, 0, 0)); + }); + }); + static_for<0, NumD0Tensor, 1>{}([&](auto i) { + d0s_threadwise_copy(i).MoveSrcSliceWindow( + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i], + make_multi_index(0, 0, 1, -Gemm0NXdlPerWave, 0, 0, 0, 0, 0, 0)); + }); + }); + static_for<0, NumD0Tensor, 1>{}([&](auto i) { + d0s_threadwise_copy(i).MoveSrcSliceWindow( + d0s_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5[i], + make_multi_index(0, 1, -Gemm0MXdlPerWave, 0, 0, 0, 0, 0, 0, 0)); + }); + } + // gemm1 + { + // TODO: explore using dynamic buffer for a1 thread buffer + // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(), + // RunWrite(), and MoveSliceWindow(). But it is impossible to implement given that + // the A1 source buffer is static buffer holding the output of first GEMM and + // requires constexpr offset by design. Therefore, we pass tensor coordinate offset + // explicitly in Run() below. + + // preload data into LDS + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + block_sync_lds(); // wait for gemm0 LDS read + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + + // main body + if constexpr(num_gemm1_k_block_inner_loop > 1) + { + static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) { + a1_blockwise_copy.Run(acc0_thread_desc_k0_m_k1, + make_tuple(Number{}, I0, I0), + acc0_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + block_sync_lds(); + + blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, c1_thread_buf); + + block_sync_lds(); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + }); + } + // tail + { + a1_blockwise_copy.Run( + acc0_thread_desc_k0_m_k1, + make_tuple( + Number<(num_gemm1_k_block_inner_loop - 1) * A1ThreadSliceK0>{}, I0, I0), + acc0_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + block_sync_lds(); + + blockwise_gemm1.Run(a1_thread_buf, b1_block_buf, c1_thread_buf); + } + } // end gemm1 + + a0_blockwise_copy.MoveSrcSliceWindow(a0_grid_desc_ak0_m_ak1, + a0_block_reset_copy_step); // rewind K + b0_blockwise_copy.MoveSrcSliceWindow(b0_grid_desc_bk0_n_bk1, + b0_block_reset_copy_step); // rewind K and step N + + block_sync_lds(); // wait for gemm1 LDS read + } while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); // end j loop + + // shuffle C1 and write out + { + static_assert(Gemm0MXdlPerWave % C1ShuffleGemm0MXdlPerWavePerShuffle == 0 && + Gemm1NXdlPerWave % C1ShuffleGemm0NXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = Gemm0MPerBlock / (Gemm0MXdlPerWave * Gemm0MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * Gemm0NPerXdl); + + // TODO: hacky, fix it! 
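+ // Worked check of the wave counts just computed (hypothetical tuning values, not taken from this file):
+ // Gemm0MPerBlock = 128, Gemm0MXdlPerWave = 2, Gemm0MPerXdl = 32  ->  MWave = 128 / (2 * 32) = 2;
+ // Gemm1NPerBlock = 128, Gemm1NXdlPerWave = 4, Gemm0NPerXdl = 32  ->  NWave = 128 / (4 * 32) = 1;
+ // the block's waves thus form an MWave x NWave grid for the C1 shuffle through LDS below.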
+ constexpr auto c1_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm1.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! + // c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm1.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetC1ShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c1_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (Gemm0MXdlPerWave) per + // shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = Gemm0MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (Gemm0NXdlPerWave) per + // shuffle + N1, // N1 = NWave + N2))), // N2 = Gemm0NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM C1 matrix starting index + const auto c1_thread_mtx_on_block = + blockwise_gemm1.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c1_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c1_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c1_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + 
m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + tensor_operation::element_wise::PassThrough{}}; + + // tuple of reference to C/Ds tensor descriptors + const auto c1_d1s_desc_refs = concat_tuple_of_reference( + tie(c1_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return d1s_grid_desc_mblock_mperblock_nblock_nperblock[i]; }, + Number{})); + + // tuple of reference to C/Ds tensor descriptors + const auto c1_d1s_buf_refs = concat_tuple_of_reference( + tie(c1_shuffle_block_buf), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return d1s_grid_buf[i]; }, + Number{})); + + // tuple of starting index of C/Ds blockwise copy + const auto idx_c1_d1s_block_begin = container_concat( + make_tuple(make_multi_index(0, 0, 0, 0)), + generate_tuple( + [&](auto) { + return make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0); + }, + Number{})); + + // shuffle: blockwise copy C from LDS to global + auto cde1_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v7< + ThisThreadBlock, + decltype(container_concat(make_tuple(C1ShuffleDataType{}), D1sDataType{})), + Tuple, + decltype(c1_d1s_desc_refs), + decltype(tie(e1_grid_desc_mblock_mperblock_nblock_nperblock)), + CDE1ElementwiseOperation, + Sequence(E1GlobalMemoryDataOperation)>, // FIXME: make Sequence + // support arbitray + // type + Sequence<1, + C1ShuffleGemm0MXdlPerWavePerShuffle * MWave * Gemm0MPerXdl, + 1, + C1ShuffleGemm0NXdlPerWavePerShuffle * NWave * + Gemm0NPerXdl>, // BlockSliceLengths, + CDE1ShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CDE1ShuffleBlockTransferScalarPerVector_NPerBlock, + sequence_merge_t< + Sequence, + uniform_sequence_gen_t>, // ThreadTransferSrcResetCoordinateAfterRunFlags + Sequence> // ThreadTransferDstResetCoordinateAfterRunFlags + {c1_d1s_desc_refs, + idx_c1_d1s_block_begin, + tie(e1_grid_desc_mblock_mperblock_nblock_nperblock), + make_tuple(make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0)), + cde1_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c1_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_e1_global = SpaceFillingCurve< + Sequence<1, Gemm0MPerBlock, 1, Gemm1NPerBlock>, + Sequence<0, 2, 1, 3>, + Sequence<1, + C1ShuffleGemm0MXdlPerWavePerShuffle * MWave * Gemm0MPerXdl, + 1, + C1ShuffleGemm0NXdlPerWavePerShuffle * NWave * Gemm0NPerXdl>>{}; + + constexpr index_t num_access = sfc_c1_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_e1_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c1_thread_copy_vgpr_to_lds.Run(c1_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c1_vgpr.GetIndexTupleOfNumber(access_id), + c1_thread_buf, + c1_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c1_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + cde1_shuffle_block_copy_lds_to_global.Run( + c1_d1s_desc_refs, + c1_d1s_buf_refs, + tie(e1_grid_desc_mblock_mperblock_nblock_nperblock), + tie(e1_grid_buf)); + + if 
constexpr(access_id < num_access - 1) + { + constexpr auto e1_global_step = sfc_e1_global.GetForwardStep(access_id); + + // move on D1s + static_for<0, NumD1Tensor, 1>{}([&](auto i) { + cde1_shuffle_block_copy_lds_to_global.MoveSrcSliceWindow( + c1_d1s_desc_refs, i + I1, e1_global_step); + }); + + // move on C + cde1_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + tie(e1_grid_desc_mblock_mperblock_nblock_nperblock), I0, e1_global_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..fec360b7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp @@ -0,0 +1,1131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_softmax.hpp" + +namespace ck { + +template +struct GridwiseBatchedGemmSoftmaxGemm_Xdl_CShuffle +{ + static_assert(LoopSched == LoopScheduler::Default, + "Non-default loop scheduler is currently not supported"); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + // Gemm0 + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + static constexpr auto Gemm0MWaves = MPerBlock / (MPerXdl * MXdlPerWave); + static constexpr auto Gemm0NWaves = NPerBlock / (NPerXdl * NXdlPerWave); + + // Gemm1 + static constexpr auto B1K0 = Number{}; + static constexpr auto B1K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + template + __host__ __device__ static constexpr auto + MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXdl); + + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + template + __host__ __device__ static constexpr auto + 
MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(const ABlockDesc_AK0_M_AK1&) + { + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(ABlockDesc_AK0_M_AK1{}); + } + + template + __host__ __device__ static constexpr auto + MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(const BBlockDesc_BK0_N_BK1&) + { + constexpr index_t Gemm1NWaves = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + return MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K( + BBlockDesc_BK0_N_BK1{}); + } + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B1 matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(B1K0, Number{}, B1K1), + make_tuple(Number{} * B1K1, B1K1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + const index_t gemm0_bytes_end = (SharedMemTrait::a_block_space_size_aligned + + SharedMemTrait::b_block_space_size_aligned) * + sizeof(FloatAB); + const index_t gemm1_bytes_end = + (SharedMemTrait::b1_block_space_offset + SharedMemTrait::b1_block_space_size_aligned) * + sizeof(FloatAB); + const index_t softmax_bytes_end = (SharedMemTrait::reduction_space_offset + + SharedMemTrait::reduction_space_size_aligned) * + sizeof(FloatGemmAcc); + const index_t c_block_bytes_end = + SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle); + + return math::max(gemm0_bytes_end, gemm1_bytes_end, softmax_bytes_end, c_block_bytes_end); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + const auto Gemm1N = b1_grid_desc_bk0_n_bk1.GetLength(I1); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && Gemm1N == c_grid_desc_m_n.GetLength(I1))) + { + return false; + } + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0 && + Gemm1N % Gemm1NPerBlock == 0)) + { + return false; + } + + // check gemm0 gridwise gemm pipeline + const auto 
num_gemm0_k_loop = K / KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm0_k_loop)) + { + return false; + } + + // check gemm1 gridwise gemm pipeline + if(!(NPerBlock % Gemm1KPerBlock == 0)) + { + return false; + } + + const auto num_gemm1_k_inner_loop = NPerBlock / Gemm1KPerBlock; + if(!GridwiseGemmPipe::IsSupported(num_gemm1_k_inner_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / Gemm1NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + struct SharedMemTrait + { + // LDS allocation for A and B: be careful of alignment + static constexpr auto a_block_desc_ak0_m_ak1 = + GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + static constexpr auto b_block_desc_bk0_n_bk1 = + GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + static constexpr auto b1_block_desc_bk0_n_bk1 = + GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + static constexpr auto max_lds_align = math::lcm(math::lcm(AK1, BK1), B1K1); + + static constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + static constexpr auto b1_block_space_size_aligned = math::integer_least_multiple( + b1_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + static constexpr auto a_block_space_offset = 0; + static constexpr auto b_block_space_offset = a_block_space_size_aligned.value; + static constexpr auto b1_block_space_offset = 0; + + // LDS allocation for reduction + static constexpr index_t reduction_space_size_aligned = + math::integer_least_multiple(BlockSize, max_lds_align); + + static constexpr auto reduction_space_offset = 0; + + // LDS allocation for C shuffle in LDS + static constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + static constexpr auto c_block_space_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + }; + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const 
FloatAB* __restrict__ p_b_grid, + const FloatAB* __restrict__ p_b1_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const AccElementwiseOperation& acc_element_op, + const B1ElementwiseOperation& b1_element_op, + const CElementwiseOperation& c_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const B1GridDesc_BK0_N_BK1& b1_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap& block_2_ctile_map, + const C0MatrixMask& c0_matrix_mask) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + const auto b1_grid_buf = make_dynamic_buffer( + p_b1_grid, b1_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/gemm1_n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t gemm1_n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * Gemm1NPerBlock); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // + // set up Gemm0 + // + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + true, // SrcResetCoord + true, // DstResetCoord + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), // will loop over GemmN dimension + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), 
+ tensor_operation::element_wise::PassThrough{}); + + // Fused Gemm+Gemm pipeline + // for n in N0: + // for k in K0: + // acc[m][n] += A[m][k] * B0[k][n] + // acc1[m][o] += acc[m][n] * B1[n][o] + + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_v2< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + decltype(MakeGemm0AMmaTileDescriptor_M0_M1_M2_K(a_block_desc_ak0_m_ak1)), + decltype(MakeGemm0BMmaTileDescriptor_N0_N1_N2_K(b_block_desc_bk0_n_bk1)), + MPerBlock, + NPerBlock, + KPerBlock, + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + true>{}; // TransposeC + + auto acc_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::a_block_space_offset, + a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b_block_space_offset, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + const auto a_block_reset_copy_step = + make_multi_index(-a_grid_desc_ak0_m_ak1.GetLength(I0), 0, 0); + const auto b_block_reset_copy_step = + make_multi_index(-b_grid_desc_bk0_n_bk1.GetLength(I0), NPerBlock, 0); + + // gridwise GEMM pipeline + // Only supports LoopScheduler::Default + const auto gridwise_gemm_pipeline = GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + // + // set up Gemm1 + // + + // Acc matrix threadwise copy: AccVGPR to VGPR and downcast to XDL input data type + constexpr auto acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + constexpr auto m0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); + constexpr auto n0 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); + constexpr auto m1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); + constexpr auto n1 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); + constexpr auto m2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); + constexpr auto n2 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); + constexpr auto n3 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); + constexpr auto n4 = acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + + constexpr auto b1_block_slice_copy_step = make_multi_index(Gemm1KPerBlock / B1K1, 0, 0); + + // acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 to acc_thread_desc_k0_m_k1 + // n0_n1_n2_n3 -> k0 + // m0_m1_m2 -> m + // n4 -> k1 + // NOTE: had to use merge_v3 or will spit out compilation errors + constexpr auto acc_thread_desc_k0_m_k1 = transform_tensor_descriptor( + acc_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + make_tuple(make_merge_transform_v3_division_mod(make_tuple(n0, n1, n2, n3)), + make_merge_transform_v3_division_mod(make_tuple(m0, m1, m2)), + make_pass_through_transform(n4)), + make_tuple(Sequence<1, 3, 5, 6>{}, Sequence<0, 2, 4>{}, Sequence<7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + // A1 matrix in AccVGPR + // N2 num_groups_per_blk, N3 num_input_blks, N4 group_size + 
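+ // Reading of the acc -> (K0, M, K1) reinterpretation above (sizes below are hypothetical, for illustration only):
+ // gemm0's per-thread accumulator, indexed as (M0,N0,M1,N1,M2,N2,N3,N4), is re-read with
+ // n0*n1*n2*n3 -> K0, m0*m1*m2 -> M, n4 -> K1 so that it can feed gemm1 as its A operand;
+ // e.g. with n0*n1*n2*n3 = 16 and n4 = 4, each thread holds a K = 64 strip per output row it owns.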
constexpr auto AccN3 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLength(I6); + + constexpr auto A1ThreadSlice_K0_M_K1 = + make_tuple(Number{}, Number{}, Number{}); + + constexpr auto A1ThreadSliceK0 = A1ThreadSlice_K0_M_K1[I0]; + constexpr auto A1ThreadSliceM = A1ThreadSlice_K0_M_K1[I1]; + constexpr auto A1ThreadSliceK1 = A1ThreadSlice_K0_M_K1[I2]; + constexpr auto a1_thread_desc_k0_m_k1 = make_naive_tensor_descriptor( + A1ThreadSlice_K0_M_K1, + make_tuple(A1ThreadSliceM * A1ThreadSliceK1, A1ThreadSliceK1, I1)); + + // B1 matrix in LDS memory, dst of blockwise copy + constexpr auto b1_block_desc_bk0_n_bk1 = GetB1BlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A1 matrix blockwise copy + auto a1_blockwise_copy = ThreadwiseTensorSliceTransfer_StaticToStatic< + FloatGemmAcc, + FloatAB, + decltype(acc_thread_desc_k0_m_k1), + decltype(a1_thread_desc_k0_m_k1), + tensor_operation::element_wise::PassThrough, + Sequence, + Sequence<1, 0, 2>, + 2, + n4>{tensor_operation::element_wise::PassThrough{}}; + + // B1 matrix blockwise copy + auto b1_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + B1BlockTransferThreadClusterLengths_BK0_N_BK1, + B1BlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b1_grid_desc_bk0_n_bk1), + decltype(b1_block_desc_bk0_n_bk1), + B1BlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + B1BlockTransferSrcVectorDim, + 2, + B1BlockTransferSrcScalarPerVector, + B1BlockTransferDstScalarPerVector_BK1, + 1, + 1, + B1ThreadTransferSrcResetCoordinateAfterRun, + true, // DstResetCoord + NumGemmKPrefetchStage>( + b1_grid_desc_bk0_n_bk1, + make_multi_index(0, gemm1_n_block_data_idx_on_grid, 0), + b1_element_op, + b1_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + tensor_operation::element_wise::PassThrough{}); + + auto a1_thread_buf = make_static_buffer( + a1_thread_desc_k0_m_k1.GetElementSpaceSize()); + + // reuse LDS space for gemm0's b_block_buf + auto b1_block_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::b1_block_space_offset, + b1_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + // selected_mfma.group_size or B1K1 <= Gemm1KPack <= selected_mfma.group_size + // selected_mfma.k_per_blk <= Gemm1KPack + // + // Following similar rationale behind Gemm0KPack, let Gemm1KPack be the lowest common + // multiples of A1K1 (predetermined by selected_mfma.group_size) and B1K1. But in this case + // Gemm1KPack can't be higher than A1K1 itself because A1 matrix is distributed in VGPRs + // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will + // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7]. 
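+ // (Concretely, with a hypothetical group_size = 4 and B1K1 = 8, the lcm rule would give Gemm1KPack = 8,
+ // but a thread's a1 elements 4..7 of that K slice actually live in the next group's registers (8..11),
+ // while b1[0:7] is contiguous -- exactly the summation-index mismatch described above.)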
+ // therefore we may just as well assign Gemm1KPack = group_size + constexpr index_t Gemm1KPack = + MfmaSelector::selected_mfma.group_size; + + auto gemm1_blockwise_gemm = BlockwiseGemmXdlops_v2< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a1_thread_desc_k0_m_k1), + decltype(b1_block_desc_bk0_n_bk1), + decltype(MakeGemm1AMmaTileDescriptor_M0_M1_M2_K(a1_thread_desc_k0_m_k1)), + decltype(MakeGemm1BMmaTileDescriptor_N0_N1_N2_K(b1_block_desc_bk0_n_bk1)), + MPerBlock, + Gemm1NPerBlock, + Gemm1KPerBlock, + MPerXdl, + NPerXdl, + MXdlPerWave, + Gemm1NXdlPerWave, + Gemm1KPack, + true, // TransposeC + Gemm1KPack, // AMmaKStride + Gemm1KPack * XdlopsGemm{}.K0PerXdlops>{ + // BMmaKStride + make_tuple(0, 0, 0, 0)}; // A_origin + + auto acc1_thread_buf = gemm1_blockwise_gemm.GetCThreadBuffer(); + + // + // Blockwise softmax + // + auto workspace_buf = make_dynamic_buffer( + static_cast(p_shared) + SharedMemTrait::reduction_space_offset, + SharedMemTrait::reduction_space_size_aligned); + + // get acc0 8D thread cluster + constexpr auto thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths() / + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + constexpr auto tm0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I0); + constexpr auto tn0 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I1); + constexpr auto tm1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I2); + constexpr auto tn1 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I3); + constexpr auto tm2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I4); + constexpr auto tn2 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I5); + constexpr auto tn3 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I6); + constexpr auto tn4 = thread_cluster_m0_n0_m1_n1_m2_n2_n3_n4.At(I7); + + // get acc0 thread map + constexpr auto m0_n_m1_to_m_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(tm0 * tm1, tm2)), + make_pass_through_transform(I1)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + constexpr auto threadid_to_m0_n_m1_adaptor = make_single_stage_tensor_adaptor( + make_tuple( + make_merge_transform(make_tuple(tm0 * tm1, tn0 * tn1 * tn2 * tn3 * tn4, tm2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + const auto threadid_to_m_n_thread_cluster_adaptor = + chain_tensor_adaptors(m0_n_m1_to_m_n_adaptor, threadid_to_m0_n_m1_adaptor); + + // get acc0 2D thread cluster & 2D thread slice + constexpr auto thread_cluster_desc_m_n = make_naive_tensor_descriptor_packed( + make_tuple(tm0 * tm1 * tm2, tn0 * tn1 * tn2 * tn3 * tn4)); + constexpr auto thread_slice_desc_m_n = + make_naive_tensor_descriptor_packed(make_tuple(m0 * m1 * m2, n0 * n1 * n2 * n3 * n4)); + + auto blockwise_softmax = BlockwiseSoftmax{}; + + const index_t num_gemm1_k_block_outer_loop = + b_grid_desc_bk0_n_bk1.GetLength(I1) / NPerBlock; + constexpr index_t num_gemm1_k_block_inner_loop = NPerBlock / Gemm1KPerBlock; + + // Initialize C + StaticBuffer + c_thread_buf; + c_thread_buf.Clear(); + + // Initialize running sum and max of exponentiating row vectors + using SoftmaxBuf = typename decltype(blockwise_softmax)::BufferType; + SoftmaxBuf running_sum, running_sum_new, running_max, running_max_new; + running_sum = 0; + running_sum_new = 0; + running_max = NumericLimits::Lowest(); + running_max_new = NumericLimits::Lowest(); + + // gemm1 K loop + index_t gemm1_k_block_outer_index = 0; + do + { + auto n_block_data_idx_on_grid = + 
__builtin_amdgcn_readfirstlane(gemm1_k_block_outer_index * NPerBlock); + if(c0_matrix_mask.IsTileSkippable( + m_block_data_idx_on_grid, n_block_data_idx_on_grid, MPerBlock, NPerBlock)) + { + continue; + } + // gemm0 + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + acc_thread_buf, + num_k_block_main_loop); + + // do MNK padding or upper triangular masking + if constexpr(MaskOutUpperTriangle || PadN) + { + // 8d thread_desc in thread scope + constexpr auto c_thread_lengths = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + + // 8d block_desc in block scope + constexpr auto c_block_lengths = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4().GetLengths(); + + constexpr auto M0 = c_block_lengths[I0]; + constexpr auto N0 = c_block_lengths[I1]; + constexpr auto M1 = c_block_lengths[I2]; + constexpr auto N1 = c_block_lengths[I3]; + constexpr auto M2 = c_block_lengths[I4]; + constexpr auto N2 = c_block_lengths[I5]; + constexpr auto N3 = c_block_lengths[I6]; + constexpr auto N4 = c_block_lengths[I7]; + + // works like multi-dimension static_for (static_ford), but provides both the linear + // index as well as n-d index + using Acc0TileIterator = SpaceFillingCurve< + decltype(c_thread_lengths), + typename arithmetic_sequence_gen<0, c_thread_lengths.Size(), 1>::type, + typename uniform_sequence_gen::type, + false>; // SnakeCurved + + auto acc0_thread_origin = blockwise_gemm.CalculateCThreadOriginDataIndex8D( + Number<0>{}, Number<0>{}, Number<0>{}, Number<0>{}); + + constexpr auto block_idx_to_m_n_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(M0, M1, M2)), + make_unmerge_transform(make_tuple(N0, N1, N2, N3, N4))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5, 6, 7>{})); + + static_for<0, Acc0TileIterator::GetNumOfAccess(), 1>{}([&](auto i) { + auto acc0_thread_idx = Acc0TileIterator::GetIndex(i) + acc0_thread_origin; + auto m_local = + block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0]; + auto n_local = + block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1]; + auto m_global = m_local + m_block_data_idx_on_grid; + auto n_global = n_local + n_block_data_idx_on_grid; + if(c0_matrix_mask.IsMaskedElement(m_global, n_global)) + { + acc_thread_buf(i) = -ck::NumericLimits::Infinity(); + } + else + { + acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); + } + }); + } + else + { + static_for<0, acc_thread_buf.Size(), 1>{}( + [&](auto i) { acc_element_op(acc_thread_buf(i), acc_thread_buf[i]); }); + } + + block_sync_lds(); // wait for lds read in gemm0 blockwise gemm + + // softmax + SoftmaxBuf& max = blockwise_softmax.max_value_buf; + SoftmaxBuf& sum = blockwise_softmax.sum_value_buf; + + blockwise_softmax.Run(acc_thread_buf, workspace_buf); + + // TODO: may convert to log domain + running_max_new = mathext::max(max, running_max); + running_sum_new = mathext::exp(running_max - running_max_new) * running_sum + + mathext::exp(max - running_max_new) * sum; + + // gemm1 + { + // TODO: explore using dynamic buffer for a1 thread buffer + // For a1_blockwise_copy, the goal is to satisfy pipeline requirements RunRead(), + // RunWrite(), and MoveSliceWindow(). 
But it is impossible to implement given that + // the A1 source buffer is static buffer holding the output of first GEMM and + // requires constexpr offset by design. Therefore, we pass tensor coordinate offset + // explicitly in Run() below. + + // Initialize acc1 + acc1_thread_buf.Clear(); + + // preload data into LDS + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + block_sync_lds(); // wait for reduction LDS read + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + + // main body + if constexpr(num_gemm1_k_block_inner_loop > 1) + { + static_for<0, num_gemm1_k_block_inner_loop - 1, 1>{}([&](auto i) { + a1_blockwise_copy.Run(acc_thread_desc_k0_m_k1, + make_tuple(Number{}, I0, I0), + acc_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + b1_blockwise_copy.RunRead(b1_grid_desc_bk0_n_bk1, b1_grid_buf); + + block_sync_lds(); + + gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); + + block_sync_lds(); + + b1_blockwise_copy.MoveSrcSliceWindow(b1_grid_desc_bk0_n_bk1, + b1_block_slice_copy_step); + + b1_blockwise_copy.RunWrite(b1_block_desc_bk0_n_bk1, b1_block_buf); + }); + } + // tail + { + a1_blockwise_copy.Run( + acc_thread_desc_k0_m_k1, + make_tuple( + Number<(num_gemm1_k_block_inner_loop - 1) * A1ThreadSliceK0>{}, I0, I0), + acc_thread_buf, + a1_thread_desc_k0_m_k1, + make_tuple(I0, I0, I0), + a1_thread_buf); + + block_sync_lds(); + + gemm1_blockwise_gemm.Run(a1_thread_buf, b1_block_buf, acc1_thread_buf); + } + } // end gemm1 + + // workaround compiler issue; see ck/ck.hpp + if constexpr(CK_WORKAROUND_SWDEV_XXXXXX_BF16_ATTEN_FWD_GFX908_ISSUE == 1 && + is_same_v && MPerBlock == 256 && NPerBlock == 128 && + Gemm1NPerBlock == 128) + { + __builtin_amdgcn_sched_barrier(0); + } + + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + constexpr auto cm0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I0); + constexpr auto cn0 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I1); + constexpr auto cm1 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I2); + constexpr auto cn1 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I3); + constexpr auto cm2 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I4); + constexpr auto cn2 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I5); + constexpr auto cn3 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I6); + constexpr auto cn4 = c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4.GetLength(I7); + constexpr auto c_thread_slice_desc_m_n = make_naive_tensor_descriptor_packed( + make_tuple(cm0 * cm1 * cm2, cn0 * cn1 * cn2 * cn3 * cn4)); + constexpr auto c_thread_buf_slice_m = c_thread_slice_desc_m_n.GetLength(I0); + constexpr auto c_thread_buf_slice_n = c_thread_slice_desc_m_n.GetLength(I1); + + static_for<0, c_thread_buf_slice_m, 1>{}([&](auto iM) { + static_for<0, c_thread_buf_slice_n, 1>{}([&](auto iN) { + auto I = Number{}; + FloatGemmAcc acc1 = acc1_thread_buf[I]; // P*V + FloatGemmAcc c = c_thread_buf[I]; // O + FloatGemmAcc c_new = + (running_sum[iM] * math::exp(running_max[iM] - running_max_new[iM]) * c + + math::exp(max[iM] - running_max_new[iM]) * acc1) / + running_sum_new[iM]; // Formula by Dao et al., + // https://arxiv.org/pdf/2205.14135v2.pdf section 3.1 + + c_thread_buf(I) = c_new; // O_new + }); + }); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_ak0_m_ak1, + a_block_reset_copy_step); 
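+ // Recap of the streaming-softmax bookkeeping for this outer iteration (restating the code above;
+ // notation: m = row max, l = row sum of exponentials, O = output accumulator):
+ //   m_new = max(m_tile, m_running)
+ //   l_new = exp(m_running - m_new) * l_running + exp(m_tile - m_new) * l_tile
+ //   O_new = (l_running * exp(m_running - m_new) * O + exp(m_tile - m_new) * (P*V)) / l_new
+ // m_new and l_new are carried into the next outer-N tile via running_max / running_sum below.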
// rewind K + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_bk0_n_bk1, + b_block_reset_copy_step); // rewind K and step N + + // update before next j iteration + running_max = running_max_new; + running_sum = running_sum_new; + + block_sync_lds(); // wait for gemm1 LDS read + } while(++gemm1_k_block_outer_index < num_gemm1_k_block_outer_loop); // end j loop + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + Gemm1NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = Gemm1NPerBlock / (Gemm1NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4 = + gemm1_blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + // TODO: hacky, fix it! + // c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp = + gemm1_blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I4); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I5); + constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I6); + constexpr auto N4 = c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2)), // M2 = MPerXdl + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2, // N2 * N3 * N4 = NPerXdl + N3, + N4))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4>{}, Sequence<>{}, Sequence<1, 3, 5, 6, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + gemm1_blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3, N4))), + 
make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_n3_n4_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + n_thread_data_on_block_idx[I2], + n_thread_data_on_block_idx[I3], + n_thread_data_on_block_idx[I4]), + tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_n2_n3_n4, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_n2_n3_n4, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } +}; + +} // namespace ck diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp new file mode 100644 index 00000000..ede6a96d --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_backward_blockwise_welford.hpp @@ -0,0 +1,554 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_batchnorm_backward_with_blockwise_welford( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const XYGridDesc_M_K dx_grid_desc_m_k, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + long_index_t reduce_size, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + DscaleDbiasDataType* const __restrict__ p_dscale, + DscaleDbiasDataType* const __restrict__ p_dbias) +{ + GridwiseBatchrNormBackwardWithBlockwiseWelford_::Run(x_grid_desc_m_k, + dy_grid_desc_m_k, + dx_grid_desc_m_k, + scale_grid_desc_m, + dscale_dbias_grid_desc_m, + mean_var_grid_desc_m, + get_reduce_count_per_thread, + reduce_size, + num_k_block_tile_iteration, + epsilon, + p_x, + p_dy, + p_scale, + haveSavedMeanInvVar, + p_savedMean, + p_savedInvVar, + dy_elementwise_op, + p_dx, + p_dscale, + p_dbias); +}; + +template +struct GridwiseBatchNormBackwardWithBlockwiseWelford +{ + static_assert((XDyDxVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0 && + MThreadSliceSize % DySrcVectorSize == 0 && + MThreadSliceSize % DxDstVectorSize == 0) || + (XDyDxVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0 && + KThreadSliceSize % DySrcVectorSize == 0 && + KThreadSliceSize % DxDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XDyDxVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + 
decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + using BlockwiseReduce = PartitionedBlockwiseReduction; + + using ThreadwiseReduce = ThreadwiseReduction; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + // clang-format off + // Blockwise BatchNorm Backward + // Input: x, dy, scale, savedMean and savedInvVar (optional), reduce_size + // Output: dx, dscale, dbias + // Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance) + // Step 2: reduction: dbias = sum(dy), dscale = sum(dy *(x-mean) * inv-variance) + // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly + // clang-format on + __device__ static void Run(const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K dy_grid_desc_m_k, + const XYGridDesc_M_K dx_grid_desc_m_k, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M dscale_dbias_grid_desc_m, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + long_index_t reduce_size, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x, + const DyDataType* const __restrict__ p_dy, + const ScaleDataType* const __restrict__ p_scale, + bool haveSavedMeanInvVar, + const MeanVarDataType* const __restrict__ p_savedMean, + const MeanVarDataType* const __restrict__ p_savedInvVar, + const DyElementwiseOp dy_elementwise_op, + DxDataType* const __restrict__ p_dx, + DscaleDbiasDataType* const __restrict__ p_dscale, + DscaleDbiasDataType* const __restrict__ p_dbias) + { + using ck::math::sqrt; + + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + x_thread_buf; + + StaticBuffer + dy_thread_buf; + + StaticBuffer + dx_thread_buf; + + // buffer of values of dy * (x-mean) * invVariance, used as input of Blockwise reduction + StaticBuffer + tmp1_thread_buf; + + StaticBuffer scale_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + StaticBuffer& + inv_var_thread_buf = var_thread_buf; + + StaticBuffer dscale_thread_buf; + StaticBuffer dbias_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, 
+ thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dy_load = ThreadwiseTensorSliceTransfer_v2( + dy_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dx_store = + ThreadwiseTensorSliceTransfer_v1r3( + dx_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize), + PassThroughOp{}); + + auto threadwise_scale_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + ScaleSrcVectorSize, + 1, + true>( + scale_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + auto threadwise_dscale_dbias_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + DscaleDbiasDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + dscale_dbias_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize); + constexpr auto thread_copy_bwd_step_m_k = make_multi_index(0, -K_BlockTileSize); + + const auto x_global_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto dy_global_buf = make_dynamic_buffer( + p_dy, dy_grid_desc_m_k.GetElementSpaceSize()); + + auto dx_global_buf = make_dynamic_buffer( + p_dx, dx_grid_desc_m_k.GetElementSpaceSize()); + + const auto scale_global_buf = make_dynamic_buffer( + p_scale, scale_grid_desc_m.GetElementSpaceSize()); + + auto dscale_global_buf = make_dynamic_buffer( + p_dscale, dscale_dbias_grid_desc_m.GetElementSpaceSize()); + + auto dbias_global_buf = make_dynamic_buffer( + p_dbias, dscale_dbias_grid_desc_m.GetElementSpaceSize()); + + // clang-format off + // Step 1: calculating mean and inv-variance using welford method (if savedMean/savedInvVar not available), where inv-variance = 1/sqrt(epsilon+variance) + // clang-format on + + if(haveSavedMeanInvVar) + { + const auto mean_global_buf = make_dynamic_buffer( + p_savedMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + const auto inv_var_global_buf = make_dynamic_buffer( + p_savedInvVar, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto threadwise_mean_inv_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf); + + threadwise_mean_inv_var_load.Run(mean_var_grid_desc_m, + inv_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + inv_var_thread_buf); + } + else + { + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = get_reduce_count_per_thread(thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf, mean_thread_buf, var_thread_buf); + } + + static_for<0, MThreadSliceSize, 
1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + // calculate inv-variance as 1/sqrt(epsilon+variance) + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + inv_var_thread_buf(I) = + type_convert(1.0) / sqrt(var_thread_buf[I] + epsilon); + }); + + threadwise_x_load.SetSrcSliceOrigin( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + }; + + // clang-format off + // Step 2: reduction: dbias = sum(dy), dscale = sum(dy *(x-mean) * inv-variance) + // clang-format on + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + dscale_thread_buf(I) = type_convert(0); + dbias_thread_buf(I) = type_convert(0); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_dy_load.Run(dx_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + tmp1_thread_buf(Number{}) = norm_x * dy_thread_buf[Number{}]; + }); + }); + + ThreadwiseReduce::Reduce(tmp1_thread_buf, dscale_thread_buf); + ThreadwiseReduce::Reduce(dy_thread_buf, dbias_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_fwd_step_m_k); + }; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + BlockwiseReduce::Reduce(reduce_work_buf, dscale_thread_buf(I)); + block_sync_lds(); + BlockwiseReduce::Reduce(reduce_work_buf, dbias_thread_buf(I)); + }); + + if(thread_k_cluster_id == 0) + { + threadwise_dscale_dbias_store.Run(thread_buffer_desc_m, + make_tuple(I0), + dscale_thread_buf, + dscale_dbias_grid_desc_m, + dscale_global_buf); + + threadwise_dscale_dbias_store.Run(thread_buffer_desc_m, + make_tuple(I0), + dbias_thread_buf, + dscale_dbias_grid_desc_m, + dbias_global_buf); + }; + + // clang-format off + // Step 3: calculating dx = 1/reduce_size * inv-variance * scale * (reduce_size * dy - dbias - dscale * (x - mean) * inv-variance)) elementwise-ly + // clang-format on + + threadwise_scale_load.Run(scale_grid_desc_m, + scale_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + scale_thread_buf); + + auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_tail_m_k); + + AccDataType inv_reduce_size = + type_convert(1.0) / type_convert(reduce_size); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + 
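            // note: the static_for below evaluates, per element, the usual batchnorm input
            // gradient (an illustrative restatement of the Step 3 formula above, writing
            // norm_x = (x - mean) * inv_var):
            //   dx = (scale * inv_var / reduce_size)
            //        * (reduce_size * dy - dbias - dscale * norm_x)
            // with dbias and dscale taken from the blockwise reductions of Step 2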
threadwise_dy_load.Run(dy_grid_desc_m_k, + dy_global_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + dy_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType multiplier = + inv_reduce_size * inv_var_thread_buf[iM] * scale_thread_buf[iM]; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + dy_elementwise_op(dy_thread_buf(Number{}), + dy_thread_buf[Number{}]); + + AccDataType norm_x = (x_thread_buf[Number{}] - mean_thread_buf[iM]) * + inv_var_thread_buf[iM]; + + AccDataType tmpVal = norm_x * dscale_thread_buf[iM]; + + dx_thread_buf(Number{}) = + multiplier * + (type_convert(reduce_size) * dy_thread_buf[Number{}] - + dbias_thread_buf[iM] - tmpVal); + }); + }); + + threadwise_dx_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + dx_thread_buf, + dx_grid_desc_m_k, + dx_global_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dy_load.MoveSrcSliceWindow(dy_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_dx_store.MoveDstSliceWindow(dx_grid_desc_m_k, thread_copy_bwd_step_m_k); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp new file mode 100644 index 00000000..33c45a0f --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_batchnorm_forward_blockwise_welford.hpp @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_batchnorm_forward_with_blockwise_welford( + const XYGridDesc_M_K x_grid_desc_m_k, + const XYGridDesc_M_K y_grid_desc_m_k, + const ScaleBiasGridDesc_M scale_grid_desc_m, + const ScaleBiasGridDesc_M bias_grid_desc_m, + const MeanVarGridDesc_M mean_var_grid_desc_m, + const GetReduceCountPerThreadFunctor get_reduce_count_per_thread, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x, + const ScaleDataType* const __restrict__ p_scale, + const BiasDataType* const __restrict__ p_bias, + const YElementwiseOp y_elementwise_op, + YDataType* const __restrict__ p_y, + bool updateMovingAverage, + AccDataType averageFactor, + MeanVarDataType* const __restrict__ resultRunningMean, + MeanVarDataType* const __restrict__ resultRunningVariance, + bool saveMeanInvVariance, + MeanVarDataType* const __restrict__ resultSaveMean, + MeanVarDataType* const __restrict__ resultSaveInvVariance) +{ + GridwiseBatchrNormForwardWithBlockwiseWelford_::Run(x_grid_desc_m_k, + y_grid_desc_m_k, + scale_grid_desc_m, + bias_grid_desc_m, + mean_var_grid_desc_m, + get_reduce_count_per_thread, + num_k_block_tile_iteration, + epsilon, + p_x, + p_scale, + p_bias, + y_elementwise_op, + p_y, + updateMovingAverage, + averageFactor, + resultRunningMean, + resultRunningVariance, + saveMeanInvVariance, + resultSaveMean, + resultSaveInvVariance); +}; + +template +struct GridwiseBatchNormForwardWithBlockwiseWelford +{ + 
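    // clang-format off
    // Blockwise BatchNorm Forward (step comments mirroring the backward kernel above)
    // Input: x, scale, bias
    // Output: y, optionally updated running mean/variance, optionally saved mean/inv-variance
    // Step 1: compute mean and variance per M-slice with the Welford method
    // Step 2: normalize: y = scale * (x - mean) / sqrt(variance + epsilon) + bias,
    //         then apply the y elementwise op via the store;
    //         the kernel folds the affine part into y = x * multiplier + fused_mean_bias, where
    //         multiplier = scale / sqrt(variance + epsilon) and
    //         fused_mean_bias = bias - mean * multiplier
    // Step 3 (optional): running = running * (1 - averageFactor) + current * averageFactor
    // Step 4 (optional): save mean and inv-variance = 1 / sqrt(epsilon + variance)
    // clang-format on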
static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcYDstVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert((XSrcYDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || + (XSrcYDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XSrcYDstVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const XYGridDesc_M_K& x_grid_desc_m_k, + const XYGridDesc_M_K& y_grid_desc_m_k, + const ScaleBiasGridDesc_M& scale_grid_desc_m, + const ScaleBiasGridDesc_M& bias_grid_desc_m, + const MeanVarGridDesc_M& mean_var_grid_desc_m, + const GetReduceCountPerThreadFunctor& get_reduce_count_per_thread, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x, + const ScaleDataType* const __restrict__ p_scale, + const BiasDataType* const __restrict__ p_bias, + const YElementwiseOp y_elementwise_op, + YDataType* const __restrict__ p_y, + bool updateMovingAverage, + AccDataType averageFactor, + MeanVarDataType* const __restrict__ resultRunningMean, + MeanVarDataType* const __restrict__ resultRunningVariance, + bool saveMeanInvVariance, + MeanVarDataType* const __restrict__ resultSaveMean, + MeanVarDataType* const __restrict__ resultSaveInvVariance) + { + using ck::math::sqrt; + + StaticBuffer + x_thread_buf; + + StaticBuffer scale_thread_buf; + + StaticBuffer bias_thread_buf; + + StaticBuffer + y_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + using ThreadBufferLengths_M = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id 
* KThreadSliceSize)); + + auto threadwise_y_store = + ThreadwiseTensorSliceTransfer_v1r3( + y_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize), + y_elementwise_op); + + auto threadwise_scale_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + ScaleSrcVectorSize, + 1, + true>( + scale_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + auto threadwise_bias_load = ThreadwiseTensorSliceTransfer_v2, + 0, + BiasSrcVectorSize, + 1, + true>( + bias_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileSize); + constexpr auto thread_copy_bwd_step_m_k = make_multi_index(0, -K_BlockTileSize); + + const auto x_global_val_buf = make_dynamic_buffer( + p_x, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto scale_global_val_buf = make_dynamic_buffer( + p_scale, scale_grid_desc_m.GetElementSpaceSize()); + + const auto bias_global_val_buf = make_dynamic_buffer( + p_bias, bias_grid_desc_m.GetElementSpaceSize()); + + auto y_global_val_buf = make_dynamic_buffer( + p_y, y_grid_desc_m_k.GetElementSpaceSize()); + + // Step 1: do welford reduction to get mean and variance + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = get_reduce_count_per_thread(thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf, mean_thread_buf, var_thread_buf); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + // Step 2: do normalization and output y + + threadwise_scale_load.Run(scale_grid_desc_m, + scale_global_val_buf, + thread_buffer_desc_m, + make_tuple(I0), + scale_thread_buf); + + threadwise_bias_load.Run(bias_grid_desc_m, + bias_global_val_buf, + thread_buffer_desc_m, + make_tuple(I0), + bias_thread_buf); + + auto thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + AccDataType multiplier = + scale_thread_buf[Number{}] / sqrt(var_thread_buf[iM] + epsilon); + + AccDataType fused_mean_bias = + bias_thread_buf[Number{}] - mean_thread_buf[iM] * multiplier; + + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + // normalize + y_thread_buf(Number{}) = + x_thread_buf[Number{}] * multiplier + fused_mean_bias; + }); 
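                    // the normalization above is the affine transform
                    //   y = (x - mean) * (scale / sqrt(var + epsilon)) + bias
                    // rewritten as one fused multiply-add per element; multiplier and
                    // fused_mean_bias are computed once per iM and reused across the iK loop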
+ }); + + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf, + y_grid_desc_m_k, + y_global_val_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k); + } + + // Step 3: update the moving average of mean and variance (optional) + + if(updateMovingAverage && thread_k_cluster_id == 0) + { + StaticBuffer + running_mean_thread_buf; + StaticBuffer + running_var_thread_buf; + + auto running_mean_global_buf = make_dynamic_buffer( + resultRunningMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto running_var_global_buf = make_dynamic_buffer( + resultRunningVariance, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto threadwise_mean_var_load = + ThreadwiseTensorSliceTransfer_v2, + 0, + MeanVarSrcDstVectorSize, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize)); + + threadwise_mean_var_load.Run(mean_var_grid_desc_m, + running_mean_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + running_mean_thread_buf); + + threadwise_mean_var_load.Run(mean_var_grid_desc_m, + running_var_global_buf, + thread_buffer_desc_m, + make_tuple(I0), + running_var_thread_buf); + + AccDataType oneMinusAverageFactor = type_convert(1.0) - averageFactor; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + running_mean_thread_buf(I) = running_mean_thread_buf[I] * oneMinusAverageFactor + + mean_thread_buf[I] * averageFactor; + running_var_thread_buf(I) = running_var_thread_buf[I] * oneMinusAverageFactor + + var_thread_buf[I] * averageFactor; + }); + + auto threadwise_mean_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + MeanVarSrcDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_mean_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + running_mean_thread_buf, + mean_var_grid_desc_m, + running_mean_global_buf); + + threadwise_mean_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + running_var_thread_buf, + mean_var_grid_desc_m, + running_var_global_buf); + }; + + // Step 4: save mean and inv-variance (optional) + + if(saveMeanInvVariance && thread_k_cluster_id == 0) + { + auto result_mean_global_buf = make_dynamic_buffer( + resultSaveMean, mean_var_grid_desc_m.GetElementSpaceSize()); + + auto result_inv_var_global_buf = make_dynamic_buffer( + resultSaveInvVariance, mean_var_grid_desc_m.GetElementSpaceSize()); + + // calculate inv-variance as 1/sqrt(epsilon+variance), stored in place of variance + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + var_thread_buf(I) = + type_convert(1.0f) / sqrt(epsilon + var_thread_buf[I]); + }); + + auto threadwise_mean_inv_var_store = + ThreadwiseTensorSliceTransfer_v1r3, + 0, + MeanVarSrcDstVectorSize, + InMemoryDataOperationEnum::Set, + 1, + true>( + mean_var_grid_desc_m, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize), + PassThroughOp{}); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + mean_thread_buf, + mean_var_grid_desc_m, + result_mean_global_buf); + + threadwise_mean_inv_var_store.Run(thread_buffer_desc_m, + make_tuple(I0), + var_thread_buf, + mean_var_grid_desc_m, + result_inv_var_global_buf); + }; + } +}; + +} // namespace ck diff --git 
a/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp new file mode 100644 index 00000000..2369f517 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_contraction_dlops_v1r2.hpp @@ -0,0 +1,662 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP +#define CK_GRIDWISE_CONTRACTION_DLOPS_V1R2_HPP + +#include "common_header.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_gemm_dlops_v2r3.hpp" +#include "blockwise_tensor_slice_transfer_v2.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_contraction_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_GK0_GM0_GM10_GM11_GK1 a_grid_desc_gk0_gm0_gm10_gm11_gk1, + const BGridDesc_GK0_GN0_GN10_GN11_GK1 b_grid_desc_gk0_gn0_gn10_gn11_gk1, + const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + const CGridBlockCluster_BlockId_To_GM10_GN10 c_grid_block_cluster_blockid_to_gm10_gn10) +{ + constexpr index_t shared_block_size = + GridwiseContraction::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseContraction::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_grid_desc_gk0_gm0_gm10_gm11_gk1, + b_grid_desc_gk0_gn0_gn10_gn11_gk1, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_block_cluster_blockid_to_gm10_gn10, + integral_constant{}, + integral_constant{}); +} + +template +struct GridwiseContractionDlops_A_GK0_GM0_GM1_GK1_B_GK0_GN0_GN1_GK1_C_GM0_GM1_GN0_GN1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // GM0 and GN0 need to known at compile-time + static constexpr auto GM0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I0); + static constexpr auto GN0 = CGridDesc_GM0_GM1_GN0_GN1{}.GetLength(I2); + static constexpr auto GK1 = AGridDesc_GK0_GM0_GM1_GK1{}.GetLength(I3); + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // lds max alignment + // TODO: part of them should be moved into blockwise-gemm + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = GK1; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, GM0, I1, Number{}, GK1), + max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, GN0, I1, Number{}, GK1), + max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1, + const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1, + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! GM0 and GN0 need to be known at compile-time"); + + const auto GM1 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2); + const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2); + const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return ( + (GM0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I0) && + GM1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1) && + GN0 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I2) && + GN1 == c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3) && + GM0 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I1) && + GM1 == a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2) && + GN0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I1) && + GN1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2) && + GK0 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0) && + GK1 == b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I3)) && + (GM1 % GM1PerBlockGM11 == 0 && GN1 % GN1PerBlockGN11 == 0 && GK0 % GK0PerBlock == 0)); + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1); + const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3); + + constexpr index_t GM11 = GM1PerBlockGM11; + constexpr index_t GN11 = GN1PerBlockGN11; + + const index_t GM10 = GM1 / GM11; + const index_t GN10 = GN1 / GN11; + + const index_t grid_size = GM10 * GN10; + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t GK0) + { + const bool has_main_k_block_loop = (GK0 + GK0PerBlock) / (2 * GK0PerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t GK0) + { + const bool has_double_tail_k_block_loop = (GK0 / GK0PerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1( + const AGridDesc_GK0_GM0_GM1_GK1& a_grid_desc_gk0_gm0_gm1_gk1) + { + const auto GK0 = a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I0); + const auto GM1 = 
a_grid_desc_gk0_gm0_gm1_gk1.GetLength(I2); + + const auto GM11 = Number{}; + const auto GM10 = GM1 / GM11; + + const auto a_grid_desc_gk0_gm0_gm10_gm11_gk1 = transform_tensor_descriptor( + a_grid_desc_gk0_gm0_gm1_gk1, + make_tuple(make_pass_through_transform(GK0), + make_pass_through_transform(GM0), + make_unmerge_transform(make_tuple(GM10, GM11)), + make_pass_through_transform(GK1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return a_grid_desc_gk0_gm0_gm10_gm11_gk1; + } + + __host__ __device__ static constexpr auto MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1( + const BGridDesc_GK0_GN0_GN1_GK1& b_grid_desc_gk0_gn0_gn1_gk1) + { + const auto GK0 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I0); + const auto GN1 = b_grid_desc_gk0_gn0_gn1_gk1.GetLength(I2); + + const auto GN11 = Number{}; + const auto GN10 = GN1 / GN11; + + const auto b_grid_desc_gk0_gn0_gn10_gn11_gk1 = transform_tensor_descriptor( + b_grid_desc_gk0_gn0_gn1_gk1, + make_tuple(make_pass_through_transform(GK0), + make_pass_through_transform(GN0), + make_unmerge_transform(make_tuple(GN10, GN11)), + make_pass_through_transform(GK1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return b_grid_desc_gk0_gn0_gn10_gn11_gk1; + } + + __host__ __device__ static constexpr auto MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1( + const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1); + const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3); + + constexpr auto GM11 = Number{}; + constexpr auto GN11 = Number{}; + + const auto GM10 = GM1 / GM11; + const auto GN10 = GN1 / GN11; + + constexpr auto BM = GM0 * GM11; + constexpr auto BN = GN0 * GN11; + + constexpr auto BM1 = + Number{}; + constexpr auto BN1 = + Number{}; + + constexpr auto BM0 = BM / BM1; + constexpr auto BN0 = BN / BN1; + + const auto c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc = transform_tensor_descriptor( + c_grid_desc_gm0_gm1_gn0_gn1, + make_tuple(make_pass_through_transform(GM0), + make_unmerge_transform(make_tuple(GM10, GM11)), + make_pass_through_transform(GN0), + make_unmerge_transform(make_tuple(GN10, GN11))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{})); + + const auto c_gm10_bm_gn10_bn_grid_desc = transform_tensor_descriptor( + c_gm0_gm10_gm11_gn0_gn10_gn11_grid_desc, + make_tuple(make_pass_through_transform(GM10), + make_merge_transform(make_tuple(GM0, GM11)), + make_pass_through_transform(GN10), + make_merge_transform(make_tuple(GN0, GN11))), + make_tuple(Sequence<1>{}, Sequence<0, 2>{}, Sequence<4>{}, Sequence<3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1 = transform_tensor_descriptor( + c_gm10_bm_gn10_bn_grid_desc, + make_tuple(make_pass_through_transform(GM10), + make_unmerge_transform(make_tuple(BM0, BM1)), + make_pass_through_transform(GN10), + make_unmerge_transform(make_tuple(BN0, BN1))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4, 5>{})); + + return c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1; + } + + __host__ __device__ static constexpr auto MakeCGridBlockCluster_BlockId_To_GM10_GN10( + 
const CGridDesc_GM0_GM1_GN0_GN1& c_grid_desc_gm0_gm1_gn0_gn1) + { + const auto GM1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I1); + const auto GN1 = c_grid_desc_gm0_gm1_gn0_gn1.GetLength(I3); + + constexpr auto GM11 = Number{}; + constexpr auto GN11 = Number{}; + + const auto GM10 = GM1 / GM11; + const auto GN10 = GN1 / GN11; + + const auto c_grid_block_cluster_blockid_to_gm10_gn10 = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(GM10, GN10))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return c_grid_block_cluster_blockid_to_gm10_gn10; + } + + using AGridDesc_GK0_GM0_GM10_GM11_GK1 = + decltype(MakeAGridDescriptor_GK0_GM0_GM10_GM11_GK1(AGridDesc_GK0_GM0_GM1_GK1{})); + using BGridDesc_GK0_GN0_GN10_GN11_GK1 = + decltype(MakeBGridDescriptor_GK0_GN0_GN10_GN11_GK1(BGridDesc_GK0_GN0_GN1_GK1{})); + using CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1 = + decltype(MakeCGridDescriptor_GM10_BM0_BM1_GN10_BN0_BN1(CGridDesc_GM0_GM1_GN0_GN1{})); + using CGridBlockCluster_BlockId_To_GM10_GN10 = + decltype(MakeCGridBlockCluster_BlockId_To_GM10_GN10(CGridDesc_GM0_GM1_GN0_GN1{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_GK0_GM0_GM10_GM11_GK1& a_grid_desc_gk0_gm0_gm10_gm11_gk1, + const BGridDesc_GK0_GN0_GN10_GN11_GK1& b_grid_desc_gk0_gn0_gn10_gn11_gk1, + const CGridDesc_GM10_BM0_BM1_GN10_BN0_BN1& c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + const CGridBlockCluster_BlockId_To_GM10_GN10& c_grid_block_cluster_blockid_to_gm10_gn10, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1.GetElementSpaceSize()); + + const auto GK0 = a_grid_desc_gk0_gm0_gm10_gm11_gk1.GetLength(I0); + + // divide block work by [GM10, GN10] + const auto c_gm10_gn10_block_cluster_idx = + c_grid_block_cluster_blockid_to_gm10_gn10.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t igm10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I0]); + const index_t ign10 = __builtin_amdgcn_readfirstlane(c_gm10_gn10_block_cluster_idx[I1]); + + // lds max alignment + // TODO: part of them should be moved into blockwise-gemm + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = GK1; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_gk0_gm0_gm10_gm11_gk1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, GM0, I1, Number{}, GK1), + max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_block_desc_gk0_gn0_gn10_gn11_gk1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, GN0, I1, Number{}, GK1), + max_lds_align); + + // A matrix in LDS memory for blockwise GEMM + // be careful of LDS alignment + constexpr auto a_block_desc_gk0_bm_gk1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, GM0 * Number{}, GK1), max_lds_align); + + // B matrix in LDS memory for blockwise GEMM + // be careful of LDS alignment + constexpr auto b_block_desc_gk0_bn_gk1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, GN0 * Number{}, GK1), max_lds_align); + + static_assert(a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize() == + a_block_desc_gk0_bm_gk1.GetElementSpaceSize() && + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize() == + b_block_desc_gk0_bn_gk1.GetElementSpaceSize(), + "wrong!"); + + // A matrix blockwise copy + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence, + ABlockTransferThreadSliceLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterLengths_GK0_GM0_GM10_GM11_GK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_gk0_gm0_gm10_gm11_gk1), + decltype(a_block_desc_gk0_gm0_gm10_gm11_gk1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3, 4>, + ABlockTransferSrcVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // SrcVectorTensorLengths + ABlockTransferDstVectorTensorLengths_GK0_GM0_GM10_GM11_GK1, // DstVectorTensorLengths + ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder + false, + true>(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + make_multi_index(0, 0, igm10, 0, 0), + a_block_desc_gk0_gm0_gm10_gm11_gk1, + make_multi_index(0, 0, 0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence, + BBlockTransferThreadSliceLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterLengths_GK0_GN0_GN10_GN11_GK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_gk0_gn0_gn10_gn11_gk1), + decltype(b_block_desc_gk0_gn0_gn10_gn11_gk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3, 4>, + BBlockTransferSrcVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // SrcVectorTensorLengths + BBlockTransferDstVectorTensorLengths_GK0_GN0_GN10_GN11_GK1, // DstVectorTensorLengths + BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3, 4>, // DstVectorTensorContiguousDimOrder + false, + true>(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + make_multi_index(0, 0, ign10, 0, 0), + b_block_desc_gk0_gn0_gn10_gn11_gk1, + make_multi_index(0, 0, 0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[GK0PerBlock, GM1PerBlockGM11] is in LDS + // b_mtx[KPerBlocl, GN1PerBlockGN11] is in LDS + // c_mtx[GM1PerBlockGM11, GN1PerBlockGN11] is distributed among threads, and saved in + // register + const auto blockwise_gemm = + 
BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< + BlockSize, + FloatAB, + FloatAB, + FloatAcc, + decltype(a_block_desc_gk0_bm_gk1), + decltype(b_block_desc_gk0_bn_gk1), + BM1PerThreadBM11, + BN1PerThreadBN11, + BK0PerThread, + BM10BN10ThreadClusterBM10Xs, + BM10BN10ThreadClusterBN10Xs, + BM1PerThreadBM11, + BN1PerThreadBN11>{}; + + constexpr auto c_thread_tensor_lengths_bm0_bm1_bn0_bn1 = + decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); + + constexpr auto c_thread_desc_bm0_bm1_bn0_bn1 = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_thread_tensor_lengths_bm0_bm1_bn0_bn1)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_thread_desc_bm0_bm1_bn0_bn1.GetElementSpaceSize()); + + ThreadwiseTensorSliceSet_v1{} + .Run(c_thread_desc_bm0_bm1_bn0_bn1, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto a_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(GK0PerBlock, 0, 0, 0, 0); + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_block_desc_gk0_gm0_gm10_gm11_gk1.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_block_desc_gk0_gn0_gn10_gn11_gk1.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{}); + + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + index_t gk0_block_on_grid = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + a_block_slice_copy_step, + AGridMoveSliceWindowStepHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + b_block_slice_copy_step, + BGridMoveSliceWindowStepHacks{}); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{}); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_thread_desc_bm0_bm1_bn0_bn1, + a_block_even_buf, + b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + 
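                // (ping-pong scheme: the GEMM above consumed the "even" LDS buffers while
                // RunRead prefetched the next K-slab into registers; RunWrite now parks that
                // slab in the "odd" buffers so global loads overlap with math, and the loop
                // advances by 2 * GK0PerBlock per iteration)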
a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + a_block_slice_copy_step, + AGridMoveSliceWindowStepHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + b_block_slice_copy_step, + BGridMoveSliceWindowStepHacks{}); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{}); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_even_buf); + + gk0_block_on_grid += 2 * GK0PerBlock; + } while(gk0_block_on_grid < GK0 - 2 * GK0PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_gk0_gm0_gm10_gm11_gk1, + a_block_slice_copy_step, + AGridMoveSliceWindowStepHacks{}); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_gk0_gn0_gn10_gn11_gk1, + b_block_slice_copy_step, + BGridMoveSliceWindowStepHacks{}); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead( + a_grid_desc_gk0_gm0_gm10_gm11_gk1, a_global_buf, AGridStepHacks{}); + b_blockwise_copy.RunRead( + b_grid_desc_gk0_gn0_gn10_gn11_gk1, b_global_buf, BGridStepHacks{}); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_block_desc_gk0_gm0_gm10_gm11_gk1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_gk0_gn0_gn10_gn11_gk1, b_block_odd_buf); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_bm0_bm1_bn0_bn1, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1 = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_thread_origin_on_block_bm0_bm1_bn0_bn1 = + blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id()); + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1), + decltype(c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1), + Sequence<1, + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I0], + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I1], + 1, + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I2], + c_thread_tensor_lengths_bm0_bm1_bn0_bn1[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + false>{c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + make_multi_index(igm10, + 
c_thread_origin_on_block_bm0_bm1_bn0_bn1[I0], + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I1], + ign10, + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I2], + c_thread_origin_on_block_bm0_bm1_bn0_bn1[I3])} + .Run(c_thread_desc_gm10_bm0_bm1_gn10_bn0_bn1, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_grid_desc_gm10_bm0_bm1_gn10_bn0_bn1, + c_grid_buf, + CGridStepHacks{}); + } + } +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp new file mode 100644 index 00000000..8b82b655 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_1d.hpp @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_elementwise_1d(const InGrid1dDescTuple in_grid_1d_desc_tuple, + const OutGrid1dDescTuple out_grid_1d_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const ElementwiseOperation elementwise_op) +{ + GridwiseElementwise1dFunctor::Run(in_grid_1d_desc_tuple, + out_grid_1d_desc_tuple, + p_in_global_tuple, + p_out_global_tuple, + elementwise_op); +} + +template +struct GridwiseElementwise_1D +{ + static constexpr index_t NumInput = InDataTypePointerTuple::Size(); + static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); + + static_assert(NumInput == InScalarPerVectorSeq::Size() && + NumOutput == OutScalarPerVectorSeq::Size() && + NumInput == InGrid1dDescTuple::Size() && + NumOutput == OutGrid1dDescTuple::Size(), + "Tuple size is inconsistent with the number of in/out!"); + + static constexpr auto I0 = Number<0>{}; + + static constexpr auto thread_buffer_desc_m = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + __device__ static void Run(const InGrid1dDescTuple in_grid_1d_desc_tuple, + const OutGrid1dDescTuple out_grid_1d_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const ElementwiseOperation elementwise_op) + { + const index_t thread_global_id = get_thread_global_1d_id(); + + auto in_thread_buf_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return StaticBuffer{}; + }, + Number{}); + + auto out_thread_buf_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_pointer_t; + + return StaticBuffer{}; + }, + Number{}); + + auto in_global_buf_tuple = generate_tuple( + [&](auto I) { + static_assert(in_grid_1d_desc_tuple[I].GetNumOfDimension() == 1); + + return make_dynamic_buffer( + p_in_global_tuple[I], in_grid_1d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + auto out_global_buf_tuple = generate_tuple( + [&](auto I) { + static_assert(out_grid_1d_desc_tuple[I].GetNumOfDimension() == 1); + + return make_dynamic_buffer( + p_out_global_tuple[I], out_grid_1d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + const auto thread_global_offset = make_multi_index(thread_global_id * MPerThread); + + const index_t 
blockSize = get_block_size(); + const index_t blockPerGrid = get_grid_size(); + const auto M = in_grid_1d_desc_tuple[I0].GetLength(I0); + const index_t loop_step = blockPerGrid * blockSize * MPerThread; + const auto loop_step_index = make_multi_index(loop_step); + + auto in_global_load_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return ThreadwiseTensorSliceTransfer_v2, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + InScalarPerVectorSeq::At( + I), // ScalarPerVector + 1, // SrcScalarStrideInVector + false>{in_grid_1d_desc_tuple[I], + thread_global_offset}; + }, + Number{}); + + auto out_global_store_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_pointer_t; + + return ThreadwiseTensorSliceTransfer_v1r3, // SliceLengths + Sequence<0>, // DimAccessOrder + 0, // SrcVectorDim + OutScalarPerVectorSeq::At(I), + InMemoryDataOperationEnum::Set, + 1, + false>( + out_grid_1d_desc_tuple[I], thread_global_offset, PassThroughOp{}); + }, + Number{}); + + index_t num_iter = M / (loop_step); + do + { + static_for<0, NumInput, 1>{}([&](auto I) { + in_global_load_tuple(I).Run(in_grid_1d_desc_tuple[I], + in_global_buf_tuple[I], + thread_buffer_desc_m, + make_tuple(I0), + in_thread_buf_tuple(I)); + + in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_1d_desc_tuple[I], + loop_step_index); + }); + + static_for<0, MPerThread, 1>{}([&](auto iM) { + // get reference to in data + const auto in_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> const auto& { return in_thread_buf_tuple(I)(iM); }, + Number{}); + + // get reference to dst data + auto out_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> auto& { return out_thread_buf_tuple(I)(iM); }, + Number{}); + + unpack2(elementwise_op, out_data_refs, in_data_refs); + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + out_global_store_tuple(I).Run(thread_buffer_desc_m, + make_tuple(I0), + out_thread_buf_tuple[I], + out_grid_1d_desc_tuple[I], + out_global_buf_tuple(I)); + + out_global_store_tuple(I).MoveDstSliceWindow(out_grid_1d_desc_tuple[I], + loop_step_index); + }); + } while(--num_iter); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp new file mode 100644 index 00000000..05257d16 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: MIT +// // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
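+//
+// Summary of the implementation below: kernel_elementwise_2d forwards to
+// GridwiseElementwise_2D::Run. Each thread owns an MPerThread x NPerThread
+// register tile; thread (tid_m, tid_n), with tid_m = tid / num_threads_n and
+// tid_n = tid % num_threads_n, starts at element (tid_m * MPerThread,
+// tid_n * NPerThread). The inner do-while sweeps the N dimension in steps of
+// num_threads_n * NPerThread, and the outer do-while sweeps M in steps of
+// num_threads_m * MPerThread while rewinding N with a negative
+// MoveSrcSliceWindow step. Inputs are loaded with
+// ThreadwiseTensorSliceTransfer_v2, combined per element by
+// ElementwiseOperation (via unpack2), and written back with
+// ThreadwiseTensorSliceTransfer_v1r3. Note that M and N are assumed to be
+// divisible by these loop steps; there is no tail handling.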
+// +#pragma once + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_elementwise_2d(const InGrid2dDescTuple in_grid_2d_desc_tuple, + const OutGrid2dDescTuple out_grid_2d_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const ElementwiseOperation elementwise_op, + const index_t num_threads_m, + const index_t num_threads_n) +{ + GridwiseElementwise2dFunctor::Run(in_grid_2d_desc_tuple, + out_grid_2d_desc_tuple, + p_in_global_tuple, + p_out_global_tuple, + elementwise_op, + num_threads_m, + num_threads_n); +} + +template +struct GridwiseElementwise_2D +{ + static constexpr index_t NumInput = InDataTypePointerTuple::Size(); + static constexpr index_t NumOutput = OutDataTypePointerTuple::Size(); + + static_assert(NumInput == InScalarPerVectorSeq::Size() && + NumOutput == OutScalarPerVectorSeq::Size() && + NumInput == InGrid2dDescTuple::Size() && + NumOutput == OutGrid2dDescTuple::Size(), + "Tuple size is inconsistent with the number of in/out!"); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr auto thread_buffer_desc_mn = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, Number{})); + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, + const OutGrid2dDescTuple out_grid_2d_desc_tuple, + const InDataTypePointerTuple p_in_global_tuple, + const OutDataTypePointerTuple p_out_global_tuple, + const ElementwiseOperation elementwise_op, + const index_t num_threads_m, + const index_t num_threads_n) + { + auto in_thread_buf_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return StaticBuffer{}; + }, + Number{}); + + auto out_thread_buf_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_pointer_t; + + return StaticBuffer{}; + }, + Number{}); + + auto in_global_buf_tuple = generate_tuple( + [&](auto I) { + return make_dynamic_buffer( + p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + auto out_global_buf_tuple = generate_tuple( + [&](auto I) { + return make_dynamic_buffer( + p_out_global_tuple[I], out_grid_2d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + const auto M = in_grid_2d_desc_tuple[I0].GetLength(I0); + const auto N = in_grid_2d_desc_tuple[I0].GetLength(I1); + + const index_t loop_step_m = num_threads_m * MPerThread; + const index_t loop_step_n = num_threads_n * NPerThread; + + const index_t thread_1d_id = get_thread_global_1d_id(); + index_t tid_m = thread_1d_id / num_threads_n; + index_t tid_n = thread_1d_id % num_threads_n; + + const auto thread_global_offset = make_multi_index(tid_m * MPerThread, tid_n * NPerThread); + + auto in_global_load_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return ThreadwiseTensorSliceTransfer_v2< + DataType, + DataType, + decltype(in_grid_2d_desc_tuple[I]), + decltype(thread_buffer_desc_mn), + Sequence, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + 0, // SrcVectorDim + InScalarPerVectorSeq::At(I), // ScalarPerVector + 1, // 
SrcScalarStrideInVector + true>{in_grid_2d_desc_tuple[I], thread_global_offset}; + }, + Number{}); + + auto out_global_store_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_pointer_t; + + return ThreadwiseTensorSliceTransfer_v1r3< + DataType, + DataType, + decltype(thread_buffer_desc_mn), + decltype(out_grid_2d_desc_tuple[I]), + PassThroughOp, + Sequence, // SliceLengths + Sequence<0, 1>, // DimAccessOrder + 1, // SrcVectorDim + 1, // OutScalarPerVectorSeq::At(I), + InMemoryDataOperationEnum::Set, + 1, + true>(out_grid_2d_desc_tuple[I], thread_global_offset, PassThroughOp{}); + }, + Number{}); + + index_t num_iter_m = M / (loop_step_m); + do + { + index_t num_iter_n = N / (loop_step_n); + do + { + static_for<0, NumInput, 1>{}([&](auto I) { + in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], + in_global_buf_tuple[I], + thread_buffer_desc_mn, + make_tuple(I0, I0), + in_thread_buf_tuple(I)); + + in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], + make_multi_index(0, loop_step_n)); + }); + + static_for<0, MPerThread, 1>{}([&](auto iM) { + static_for<0, NPerThread, 1>{}([&](auto iN) { + constexpr auto offset = + thread_buffer_desc_mn.CalculateOffset(make_tuple(iM, iN)); + // get reference to in data + const auto in_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> const auto& { + return in_thread_buf_tuple(I)(Number{}); + }, + Number{}); + + // get referenec to dst data + auto out_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> auto& { + return out_thread_buf_tuple(I)(Number{}); + }, + Number{}); + unpack2(elementwise_op, out_data_refs, in_data_refs); + }); + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + out_global_store_tuple(I).Run(thread_buffer_desc_mn, + make_tuple(I0, I0), + out_thread_buf_tuple[I], + out_grid_2d_desc_tuple[I], + out_global_buf_tuple(I)); + + out_global_store_tuple(I).MoveDstSliceWindow(out_grid_2d_desc_tuple[I], + make_multi_index(0, loop_step_n)); + }); + + } while(--num_iter_n); + + static_for<0, NumInput, 1>{}([&](auto I) { + in_global_load_tuple(I).MoveSrcSliceWindow( + in_grid_2d_desc_tuple[I], + make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n)); + }); + + static_for<0, NumOutput, 1>{}([&](auto I) { + out_global_store_tuple(I).MoveDstSliceWindow( + out_grid_2d_desc_tuple[I], + make_multi_index(loop_step_m, -(N / loop_step_n) * loop_step_n)); + }); + } while(--num_iter_m); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp new file mode 100644 index 00000000..989f7f1c --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_elementwise_layernorm_welford_variance.hpp @@ -0,0 +1,500 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// X = Elementwise(input1, input2, input3, ...) 
+// Y = Normalization(X, beta, gamma) +template +struct GridwiseElementwiseLayernormWelfordVariance_mk_to_mk +{ + static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || + (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr index_t NumInput = InDataTypePointerTuple::Size(); + + static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; + + static constexpr auto XThreadBufferNumber = Number{}; + static constexpr auto GammaThreadBufferNumber = Number{}; + static constexpr auto BetaThreadBufferNumber = Number{}; + static constexpr auto YThreadBufferNumber = Number{}; + + __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, + int thread_k_cluster_id) + { + int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; + int kPerThread = + kPerBlock < K_BlockTileSize ? 
0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize); + int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize; + + if(kPerBlockTail > 0) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + int thread_max_len = + (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i; + int delta = thread_max_len - kPerBlockTail; + delta = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize); + kPerThread += XSrcVectorSize - delta; + }); + } + + return kPerThread; + } + + __device__ static void Run(const InGrid2dDescTuple in_grid_2d_desc_tuple, + const GridDesc_M_K& x_grid_desc_m_k, + const GridDesc_M_K& gamma_grid_desc_m_k, + const GridDesc_M_K& beta_grid_desc_m_k, + const GridDesc_M_K& y_grid_desc_m_k, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const InDataTypePointerTuple p_in_global_tuple, + XDataType* const __restrict__ p_x_lds, + const GammaDataType* const __restrict__ p_gamma_global, + const BetaDataType* const __restrict__ p_beta_global, + YDataType* const __restrict__ p_y_global, + const XElementwiseOperation x_elementwise_op, + const YElementwiseOperation y_elementwise_op) + { + if constexpr(SweepOnce) + { + num_k_block_tile_iteration = 1; + } + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t grid_size = get_grid_size(); + + auto in_global_buf_tuple = generate_tuple( + [&](auto I) { + static_assert(in_grid_2d_desc_tuple[I].GetNumOfDimension() == + 2); // matrix dimension + + return make_dynamic_buffer( + p_in_global_tuple[I], in_grid_2d_desc_tuple[I].GetElementSpaceSize()); + }, + Number{}); + + auto y_global_val_buf = make_dynamic_buffer( + p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); + + auto x_lds_val_buf = make_dynamic_buffer( + p_x_lds, x_grid_desc_m_k.GetElementSpaceSize() / grid_size); + + auto in_thread_buf_tuple = generate_tuple( + [&](auto) { + return generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + }, + Number{}); + + auto x_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto gamma_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto beta_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto y_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto in_global_load_tuple = generate_tuple( + [&](auto I) { + using DataTypePointer = remove_cvref_t; + using DataType = remove_cv_t>; + + return ThreadwiseTensorSliceTransfer_v2{ + in_grid_2d_desc_tuple[I], + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize)}; + }, + Number{}); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize)); + + auto threadwise_gamma_load = + ThreadwiseTensorSliceTransfer_v2( + gamma_grid_desc_m_k, + 
make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * GammaSrcVectorSize)); + + auto threadwise_beta_load = + ThreadwiseTensorSliceTransfer_v2( + beta_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * BetaSrcVectorSize)); + + using PassThrough = tensor_operation::element_wise::PassThrough; + PassThrough pass_through_op; + auto threadwise_x_store = + ThreadwiseTensorSliceTransfer_v1r3( + x_grid_desc_m_k, + make_multi_index(thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize), + pass_through_op); + + auto threadwise_y_store = + ThreadwiseTensorSliceTransfer_v1r3( + y_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * YDstVectorSize), + y_elementwise_op); + + // Copy x from Cache + // one pass: fwd, second pass: bwd + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize); + constexpr auto thread_copy_bwd_step_m_k = + make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize); + + const auto gamma_global_val_buf = make_dynamic_buffer( + p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); + + const auto beta_global_val_buf = make_dynamic_buffer( + p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, NumInput, 1>{}([&](auto I) { // input load loop + in_global_load_tuple(I).Run(in_grid_2d_desc_tuple[I], + in_global_buf_tuple[I], + thread_buffer_desc_m_k, + make_tuple(I0, I0), + in_thread_buf_tuple(iK0)(I)); + + in_global_load_tuple(I).MoveSrcSliceWindow(in_grid_2d_desc_tuple[I], + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { // input add loop + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // get reference to in data + const auto in_data_refs = generate_tie( + // return type should be lvalue + [&](auto I) -> const auto& { + return in_thread_buf_tuple(iK0)(I)(Number{}); + }, + Number{}); + + // get reference to dst data + auto out_data_refs = generate_tie( + // return type should be lvalue + [&](auto) -> auto& { return x_thread_buf(iK0)(Number{}); }, + I1); + + unpack2(x_elementwise_op, out_data_refs, in_data_refs); + }); + }); + threadwise_welford.Run(x_thread_buf[iK0], mean_thread_buf, var_thread_buf); + + if constexpr(!SweepOnce) + { + threadwise_x_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(iK0), + x_grid_desc_m_k, + x_lds_val_buf); + threadwise_x_store.MoveDstSliceWindow(x_grid_desc_m_k, + thread_copy_fwd_step_m_k); + } + }); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + auto thread_copy_tail_m_k = + (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k; + + if 
constexpr(!SweepOnce) + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + if constexpr(!SweepOnce) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_lds_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + } + + static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) { + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf(i)); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(iM) + epsilon); + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // normalize + y_thread_buf(iK0)(Number{}) = + (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * + divisor; + + // gamma + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) * + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + + beta_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, YThreadBufferNumber, 1>{}([&](auto i) { + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf(i), + y_grid_desc_m_k, + y_global_val_buf); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + + if constexpr(!SweepOnce) + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..16ba2328 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_bias_add_reduce_xdl_cshuffle_v1.hpp @@ -0,0 +1,997 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
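+//
+// Summary of the implementation below: fused GEMM + bias + add + reduction.
+// Per C tile the epilogue effectively computes
+//   E    = c_element_op(A * B + bias)
+//   C    = E + c1_element_op(D0)                      // D0 = p_d0_grid
+//   R[i] = ReduceOperations[i] over the N dimension of reduce_in_element_ops[i](C)
+// where each R[i] is post-processed by reduce_out_element_ops[i] and accumulated
+// to global memory with ReduceGlobalMemoryDataOperation::At(i), one value per row.
+// The GEMM accumulators are shuffled from VGPR through LDS (CShuffle) before this
+// epilogue, and the per-thread partial reductions are formed by ThreadwiseReduction.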
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_bias_add_reduce_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC0* __restrict__ p_bias_grid, + const FloatC1* __restrict__ p_d0_grid, + ReducePtrsGlobal p_reduces_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const C1ElementwiseOperation c1_element_op, + const ReduceInElementwiseOperations reduce_in_element_ops, + const ReduceAccElementwiseOperations reduce_out_element_ops, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c0_grid_desc_mblock_mperblock_nblock_nperblock, + const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c1_grid_desc_mblock_mperblock_nblock_nperblock, + const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_bias_grid, + p_d0_grid, + p_reduces_grid, + p_shared, + a_element_op, + b_element_op, + c_element_op, + c1_element_op, + reduce_in_element_ops, + reduce_out_element_ops, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c0_grid_desc_mblock_mperblock_nblock_nperblock, + c1_grid_desc_mblock_mperblock_nblock_nperblock, + reduce_grid_desc_mblock_mperblock, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_bias_grid; + ignore = p_d0_grid; + ignore = p_reduces_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = c1_element_op; + ignore = reduce_in_element_ops; + ignore = reduce_out_element_ops; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = c0_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = c1_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = reduce_grid_desc_mblock_mperblock; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct 
GridwiseGemmBiasAddReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatCShuffle)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + // static_assert(is_known_at_compile_time>::value && + // is_known_at_compile_time>::value, + // "wrong! 
K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N_& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr auto + MakeReduceGridDescriptor_MBlock_MPerBlock(const ReduceGridDesc_M& d_grid_desc_m) + { + const auto M = d_grid_desc_m.GetLength(I0); + const auto MBlock = M / MPerBlock; + + const auto reduce_grid_desc_mblock_mperblock = transform_tensor_descriptor( + d_grid_desc_m, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{}))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + return reduce_grid_desc_mblock_mperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using ReduceGridDescriptor_MBlock_MPerBlock = + remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC0* __restrict__ p_bias_grid, + const FloatC1* __restrict__ p_d0_grid, + ReducePtrsGlobal p_reduces_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const C1ElementwiseOperation& c1_element_op, + const ReduceInElementwiseOperations& reduce_in_element_ops, + const ReduceAccElementwiseOperations& reduce_out_element_ops, + const 
AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const C0GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c0_grid_desc_mblock_mperblock_nblock_nperblock, + const C1GridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c1_grid_desc_mblock_mperblock_nblock_nperblock, + const ReduceGridDescriptor_MBlock_MPerBlock& reduce_grid_desc_mblock_mperblock, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + auto c0_grid_buf = make_dynamic_buffer( + p_bias_grid, c0_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + auto c1_grid_buf = make_dynamic_buffer( + p_d0_grid, c1_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + 
make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C + reduction + write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + 
SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + // TODO: this should be implemented as a blockwise reduction + // LDS c_reduce_block_desc_mperblock_nperblock + constexpr auto c_reduce_block_desc_mperblock_nperblock = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I1)), + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I3))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<>{}, Sequence<1>{})); + + static_assert(CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) * + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) == + BlockSize, + "wrong!"); + + static_assert((CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) % + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) == + 0 && + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) % + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) == + 0, + "wrong!"); + + constexpr index_t mreduce_per_thread = + (CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) / + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0); + + constexpr index_t nreduce_per_thread = + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) / + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1); + + constexpr auto c_reduce_thread_lengths_mperblock_nperblock = + Sequence{}; + + // VGPR c_reduce_thread_desc_mperblock_nperblock + constexpr auto c_reduce_thread_desc_mperblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + // VGPR reduce_thread_desc_mperblock + constexpr auto reduce_thread_desc_mperblock = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + // VGPR reduce_thread_desc_mblock_mperblock + constexpr auto reduce_thread_desc_mblock_mperblock = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{})); + + auto c_reduce_thread_buf = make_static_buffer( + c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize()); + + // reduce: threadwise copy from LDS to VGPR + constexpr auto c_reduce_thread_cluster_desc = make_cluster_descriptor( + CReduceThreadClusterLengths_MPerBlock_NPerBlock{}, Sequence<1, 0>{}); + + const auto c_reduce_thread_cluster_idx = + c_reduce_thread_cluster_desc.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto c_reduce_thread_data_idx_begin = + c_reduce_thread_cluster_idx * c_reduce_thread_lengths_mperblock_nperblock; + + auto c_reduce_thread_copy_lds_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatCShuffle, + FloatReduceAcc, + decltype(c_reduce_block_desc_mperblock_nperblock), + decltype(c_reduce_thread_desc_mperblock_nperblock), + decltype(c_reduce_thread_lengths_mperblock_nperblock), + Sequence<0, 1>, + 1, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + 1, + true>{c_reduce_block_desc_mperblock_nperblock, c_reduce_thread_data_idx_begin}; + + auto reduce_tuple_thread_copy_vgpr_to_global = generate_tuple( + [&](auto I) { + auto p_reduce_grid = p_reduces_grid[I]; + auto reduce_acc_element_op = 
reduce_out_element_ops[I]; + + return ThreadwiseTensorSliceTransfer_v1r3< + FloatReduceAcc, + remove_pointer_t, + decltype(reduce_thread_desc_mblock_mperblock), + decltype(reduce_grid_desc_mblock_mperblock), + decltype(reduce_acc_element_op), + Sequence<1, mreduce_per_thread>, + Sequence<0, 1>, + 1, + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + ReduceGlobalMemoryDataOperation::At(I), + 1, + false>{reduce_grid_desc_mblock_mperblock, + make_multi_index(block_work_idx[I0], // mblock + c_reduce_thread_data_idx_begin[I0]), // mperblock + reduce_acc_element_op}; + }, + Number{}); + + // c0 and c1 + constexpr auto c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, I1, Number{})); + + constexpr auto c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock = + c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock; + + auto c01_thread_buf = make_static_buffer( + c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + auto c0_thread_copy_global_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatC0, + FloatReduceAcc, + decltype(c0_grid_desc_mblock_mperblock_nblock_nperblock), + decltype(c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock), + Sequence, + Sequence<0, 1, 2, 3>, + 3, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + 1, + true>( + c0_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(I0, + m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0], + I0, + n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1])); + + auto c1_thread_copy_global_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatC1, + FloatReduceAcc, + decltype(c1_grid_desc_mblock_mperblock_nblock_nperblock), + decltype(c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock), + Sequence, + Sequence<0, 1, 2, 3>, + 3, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + 1, + true>( + c1_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(I0, + m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0], + I0, + n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1])); + + constexpr auto c_reduce_thread_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, I1, Number{})); + + auto c_reduce_thread_copy_vgpr_to_global = ThreadwiseTensorSliceTransfer_v1r3< + FloatReduceAcc, + FloatC, + decltype(c_reduce_thread_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + tensor_operation::element_wise::PassThrough, + Sequence, // SliceLengths + Sequence<0, 1, 2, 3>, // DimAccessOrder + 3, // DstVectorDim + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(I0, + m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0], + I0, + n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1]), + tensor_operation::element_wise::PassThrough{}}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to write to LDS + 
block_sync_lds(); + { + c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock, + c_shuffle_block_buf, + c_reduce_thread_desc_mperblock_nperblock, + make_tuple(I0, I0), + c_reduce_thread_buf); + + c0_thread_copy_global_to_vgpr.Run( + c0_grid_desc_mblock_mperblock_nblock_nperblock, + c0_grid_buf, + c0_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c01_thread_buf); + + // c = activation(c + bias) + static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + FloatReduceAcc out; + c_element_op(out, c_reduce_thread_buf(i) + c01_thread_buf(i)); + c_reduce_thread_buf(i) = out; + }); + + c1_thread_copy_global_to_vgpr.Run( + c1_grid_desc_mblock_mperblock_nblock_nperblock, + c1_grid_buf, + c1_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c01_thread_buf); + + // c = c + c1_functior(c1) + static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + c1_element_op(c01_thread_buf(i), c01_thread_buf(i)); + c_reduce_thread_buf(i) += c01_thread_buf(i); + }); + + c_reduce_thread_copy_vgpr_to_global.Run( + c_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c_reduce_thread_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) { + auto& p_reduce_grid = p_reduces_grid[In]; + + auto reduce_grid_buf = make_dynamic_buffer( + p_reduce_grid, reduce_grid_desc_mblock_mperblock.GetElementSpaceSize()); + + auto reduce_thread_buf = + make_static_buffer( + reduce_thread_desc_mperblock.GetElementSpaceSize()); + + auto& reduce_in_element_op = reduce_in_element_ops[In]; + + auto& reduce_thread_copy_vgpr_to_global = + reduce_tuple_thread_copy_vgpr_to_global(In); + + using ReduceOperation = remove_cvref_t; + using ThreadwiseReduce = + ThreadwiseReduction; + + // Global write Gemm shuffle + reduction + const auto reduce_identityVal = + ReduceOperation::template GetIdentityValue(); + + static_for<0, mreduce_per_thread, 1>{}( + [&](auto I) { reduce_thread_buf(I) = reduce_identityVal; }); + + // reduce in VGPR + static_for<0, mreduce_per_thread, 1>{}([&](auto im) { + static_for<0, nreduce_per_thread, 1>{}([&](auto in) { + constexpr auto offset = + Number{}; + + reduce_in_element_op(c_reduce_thread_buf(offset), + c_reduce_thread_buf(offset)); + }); + }); + + ThreadwiseReduce::Reduce(c_reduce_thread_buf, reduce_thread_buf); + + // copy from VGPR to Global + reduce_thread_copy_vgpr_to_global.Run(reduce_thread_desc_mblock_mperblock, + make_tuple(I0, I0), + reduce_thread_buf, + reduce_grid_desc_mblock_mperblock, + reduce_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow( + reduce_grid_desc_mblock_mperblock, + make_tuple(c_global_step[I0], c_global_step[I1])); + } + }); + } + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + + // move on C0 + c0_thread_copy_global_to_vgpr.MoveSrcSliceWindow( + c0_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + + // move on C1 + c1_thread_copy_global_to_vgpr.MoveSrcSliceWindow( + c1_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } // 
Reduction + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp new file mode 100644 index 00000000..a9522a66 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_multiple_d.hpp @@ -0,0 +1,678 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +struct GridwiseGemmDlMultipleD_km_kn_mn +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + // ck::Tuple + static constexpr auto MakeDsGridPointer() + { + return generate_tuple( + [&](auto i) { + using DDataType = remove_cvref_t>; + + return static_cast(nullptr); + }, + Number{}); + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + math::integer_least_multiple(b_block_desc_k_n.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return (M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && + K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2)) && + (M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0); + } + + __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N) + { + const index_t grid_size = (M / MPerBlock) * (N / NPerBlock); + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K0) + { + const bool has_main_k_block_loop = (K0 + K0PerBlock) / (2 * K0PerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K0) + { + const bool has_double_tail_k_block_loop = (K0 / K0PerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAGridDescriptor_K0_M0_M1_K1(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1) + { + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + + const auto M1 = Number{}; + const auto M0 = M / M1; + + const auto a_grid_desc_k0_m0_m1_k1 = + transform_tensor_descriptor(a_grid_desc_k0_m_k1, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(M0, M1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return a_grid_desc_k0_m0_m1_k1; + } + + __host__ __device__ static constexpr auto + MakeBGridDescriptor_K0_N0_N1_K1(const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1) + { + const auto K0 = b_grid_desc_k0_n_k1.GetLength(I0); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + + const auto N1 = Number{}; + const auto N0 = N / N1; + + const auto b_grid_desc_k0_n0_n1_k1 = + transform_tensor_descriptor(b_grid_desc_k0_n_k1, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(N0, N1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + 
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return b_grid_desc_k0_n0_n1_k1; + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + constexpr auto M11 = + Number{}; + constexpr auto N11 = + Number{}; + + constexpr auto M10 = M1 / M11; + constexpr auto N10 = N1 / N11; + + const auto c_grid_desc_m0_m10_m11_n0_n10_n11 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), + make_unmerge_transform(make_tuple(N0, N10, N11))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_grid_desc_m0_m10_m11_n0_n10_n11; + } + + // Ds desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeDsGridDescriptor_M0_M10_M11_N0_N10_N11(const DsGridDesc_M_N& ds_grid_desc_m_n) + { + return generate_tuple( + [&](auto i) { return MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(ds_grid_desc_m_n[i]); }, + Number{}); + } + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N00_M01_N01( + c_grid_desc_m_n); + } + + using AGridDesc_K0_M0_M1_K1 = decltype(MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{})); + using BGridDesc_K0_N0_N1_K1 = decltype(MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{})); + using Block2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{})); + + using DsGridPointer = decltype(MakeDsGridPointer()); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + DsGridPointer p_ds_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AElementwiseOperation&, + const BElementwiseOperation&, + const CDEElementwiseOperation& cde_element_op, + const AGridDesc_K0_M0_M1_K1& a_grid_desc_k0_m0_m1_k1, + const BGridDesc_K0_N0_N1_K1& b_grid_desc_k0_n0_n1_k1, + const DsGridDesc_M0_M10_M11_N0_N10_N11& ds_grid_desc_m0_m10_m11_n0_n10_n11, + const CGridDesc_M0_M10_M11_N0_N10_N11& c_grid_desc_m0_m10_m11_n0_n10_n11, + const Block2CTileMap& block_2_ctile_map, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m0_m1_k1.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n0_n1_k1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_m0_m10_m11_n0_n10_n11.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto c_m0_n0_block_cluster_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); + const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); + + if(!block_2_ctile_map.ValidCTileIndex( + make_tuple(im0, in0), + make_tuple(c_grid_desc_m0_m10_m11_n0_n10_n11.GetLength(I0), + c_grid_desc_m0_m10_m11_n0_n10_n11.GetLength(I3)))) + { + return; + } + + // 
TODO: change this. I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_block_desc_k0_n0_n1_k1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // A matrix in LDS memory, for blockwise GEMM + constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, for blockwise GEMM + constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + static_assert(a_block_desc_k0_m0_m1_k1.GetElementSpaceSize() == + a_k0_m_k1_block_desc.GetElementSpaceSize() && + b_block_desc_k0_n0_n1_k1.GetElementSpaceSize() == + b_k0_n_k1_block_desc.GetElementSpaceSize() && + "wrong!"); + + // A matrix blockwise copy + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence, + ABlockTransferThreadSliceLengths_K0_M0_M1_K1, + ABlockTransferThreadClusterLengths_K0_M0_M1_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + remove_reference_t, + decltype(a_block_desc_k0_m0_m1_k1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3>, + ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, // SrcVectorTensorLengths + ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, // DstVectorTensorLengths + ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder + false, + true>(a_grid_desc_k0_m0_m1_k1, + make_multi_index(0, im0, 0, 0), + a_block_desc_k0_m0_m1_k1, + make_multi_index(0, 0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence, + BBlockTransferThreadSliceLengths_K0_N0_N1_K1, + BBlockTransferThreadClusterLengths_K0_N0_N1_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + remove_reference_t, + decltype(b_block_desc_k0_n0_n1_k1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3>, + BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, // SrcVectorTensorLengths + BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, // DstVectorTensorLengths + BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder + false, + true>(b_grid_desc_k0_n0_n1_k1, + make_multi_index(0, in0, 0, 0), + b_block_desc_k0_n0_n1_k1, + make_multi_index(0, 0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[KPerBlocl, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + const auto blockwise_gemm = + BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< + BlockSize, + FloatAB, + FloatAB, + FloatAcc, + decltype(a_k0_m_k1_block_desc), + decltype(b_k0_n_k1_block_desc), + M1PerThreadM111, + N1PerThreadN111, + KPerThread, + M11N11ThreadClusterM110Xs, + 
M11N11ThreadClusterN110Xs, + M1PerThreadM111, + N1PerThreadN111>{}; + + constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = + decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); + + constexpr auto c_thread_desc_m10_m11_n10_n11 = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_k0_m0_m1_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_block_desc_k0_n0_n1_k1.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize()); + + // Initialize C + c_thread_buf.Clear(); + + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0, 0); + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_block_desc_k0_m0_m1_k1.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_block_desc_k0_n0_n1_k1.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_block_desc_k0_m0_m1_k1.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_block_desc_k0_n0_n1_k1.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + const auto K0 = a_grid_desc_k0_m0_m1_k1.GetLength(I0); + + index_t k_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m0_m1_k1, + a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n0_n1_k1, + b_block_slice_copy_step); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_thread_desc_m10_m11_n10_n11, + a_block_even_buf, + b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m0_m1_k1, + a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n0_n1_k1, + b_block_slice_copy_step); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + 
c_thread_desc_m10_m11_n10_n11, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_even_buf); + + k_block_data_begin += 2 * K0PerBlock; + } while(k_block_data_begin < K0 - 2 * K0PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m0_m1_k1, a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n0_n1_k1, b_block_slice_copy_step); + + block_sync_lds(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_thread_desc_m10_m11_n10_n11, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_odd_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_m10_m11_n10_n11, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_m10_m11_n10_n11, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_thread_desc_m0_m10_m11_n0_n10_n11 = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = + blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id()); + + const auto ds_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_ds_grid[i], ds_grid_desc_m0_m10_m11_n0_n10_n11[i].GetElementSpaceSize()); + }, + Number{}); + + auto ds_thread_buf = generate_tuple( + [&](auto i) { + using DDataType = remove_cvref_t>; + + return StaticBuffer{}; + }, + Number{}); + + auto ds_threadwise_copy = generate_tuple( + [&](auto i) { + using DDataType = remove_cvref_t>; + + return ThreadwiseTensorSliceTransfer_v2< + DDataType, + DDataType, + decltype(ds_grid_desc_m0_m10_m11_n0_n10_n11[i]), + decltype(c_thread_desc_m0_m10_m11_n0_n10_n11), + Sequence{}>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + 1, + false>(ds_grid_desc_m0_m10_m11_n0_n10_n11[i], + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])); + }, + Number{}); + + static_for<0, c_m10_m11_n10_n11_thread_tensor_lengths[I0], 1>{}([&](auto m10) { + static_for<0, c_m10_m11_n10_n11_thread_tensor_lengths[I1], 1>{}([&](auto m11) { + static_for<0, c_m10_m11_n10_n11_thread_tensor_lengths[I2], 1>{}([&](auto n10) { + // load d matrix data + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_threadwise_copy(i).Run(ds_grid_desc_m0_m10_m11_n0_n10_n11[i], + ds_grid_buf[i], + c_thread_desc_m0_m10_m11_n0_n10_n11, + make_tuple(I0, I0, I0, I0, I0, I0), + ds_thread_buf(i)); + }); + // cal element op + static_for<0, 
c_m10_m11_n10_n11_thread_tensor_lengths[I3], 1>{}( + [&](auto i) { + // get reference to src data + const auto src_data_refs = generate_tie( + // return type should be lvalue + [&](auto iSrc) -> const auto& { + return ds_thread_buf[iSrc][i]; + }, + Number{}); + + // get reference to dst data + constexpr index_t c_offset = + c_thread_desc_m0_m10_m11_n0_n10_n11.CalculateOffset( + make_tuple(0, m10, m11, 0, n10, i)); + auto dst_data_refs = generate_tie( + // return type should be lvalue + [&](auto) -> auto& { return c_thread_buf(Number{}); }, + Number<2>{}); + + unpack2(cde_element_op, dst_data_refs, src_data_refs); + }); + + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_threadwise_copy(i).MoveSrcSliceWindow( + ds_grid_desc_m0_m10_m11_n0_n10_n11[i], + make_multi_index(0, 0, 0, 0, 1, 0)); + }); + }); + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_threadwise_copy(i).MoveSrcSliceWindow( + ds_grid_desc_m0_m10_m11_n0_n10_n11[i], + make_multi_index( + 0, 0, 1, 0, -c_m10_m11_n10_n11_thread_tensor_lengths[I2], 0)); + }); + }); + static_for<0, NumDTensor, 1>{}([&](auto i) { + ds_threadwise_copy(i).MoveSrcSliceWindow( + ds_grid_desc_m0_m10_m11_n0_n10_n11[i], + make_multi_index( + 0, 1, -c_m10_m11_n10_n11_thread_tensor_lengths[I1], 0, 0, 0)); + }); + }); + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_thread_desc_m0_m10_m11_n0_n10_n11), + decltype(c_grid_desc_m0_m10_m11_n0_n10_n11), + ck::tensor_operation::element_wise::PassThrough, + Sequence<1, + c_m10_m11_n10_n11_thread_tensor_lengths[I0], + c_m10_m11_n10_n11_thread_tensor_lengths[I1], + 1, + c_m10_m11_n10_n11_thread_tensor_lengths[I2], + c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{c_grid_desc_m0_m10_m11_n0_n10_n11, + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3]), + ck::tensor_operation::element_wise::PassThrough{}} + .Run(c_thread_desc_m0_m10_m11_n0_n10_n11, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_grid_desc_m0_m10_m11_n0_n10_n11, + c_grid_buf); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp new file mode 100644 index 00000000..c839cde0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dl_v1r3.hpp @@ -0,0 +1,577 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
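+// [Illustrative note added for readability; not part of the original kernel code.]
+// The loop predicates below partition the K0 dimension for the LDS double buffer.
+// With hypothetical tile sizes K0 = 64 and K0PerBlock = 8:
+//   CalculateHasMainKBlockLoop(64)       -> (64 + 8) / (2 * 8) = 4 > 1  -> true
+//   CalculateHasDoubleTailKBlockLoop(64) -> (64 / 8) % 2 == 0           -> true
+// so the preload fills the "even" LDS buffer, the main do-while loop runs three
+// times (consuming K0-blocks 0..5 while prefetching the next pair), and the
+// double-tail branch finishes K0-blocks 6 and 7. GetSharedMemoryNumberOfByte
+// reserves two copies of the A and B tiles precisely because of this ping-pong.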
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_dl_v2r3.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_tensor_slice_transfer_v5r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dl_v1r3(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_K0_M0_M1_K1 a_grid_desc_k0_m0_m1_k1, + const BGridDesc_K0_N0_N1_K1 b_grid_desc_k0_n0_n1_k1, + const CGridDesc_M0_M10_M11_N0_N10_N11 c_grid_desc_m0_m10_m11_n0_n10_n11, + const Block2CTileMap block_2_ctile_map) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_grid_desc_k0_m0_m1_k1, + b_grid_desc_k0_n0_n1_k1, + c_grid_desc_m0_m10_m11_n0_n10_n11, + block_2_ctile_map, + integral_constant{}, + integral_constant{}); +} + +template +struct GridwiseGemmDl_km_kn_mn_v1r3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k_m = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k_n = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_block_desc_k_m.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + math::integer_least_multiple(b_block_desc_k_n.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return (M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && + K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2)) && + (M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0); + } + + __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N) + { + const index_t grid_size = (M / MPerBlock) * (N / NPerBlock); + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K0) + { + const bool has_main_k_block_loop = (K0 + K0PerBlock) / (2 * K0PerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K0) + { + const bool has_double_tail_k_block_loop = (K0 / K0PerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAGridDescriptor_K0_M0_M1_K1(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1) + { + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + + const auto M1 = Number{}; + const auto M0 = M / M1; + + const auto a_grid_desc_k0_m0_m1_k1 = + transform_tensor_descriptor(a_grid_desc_k0_m_k1, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(M0, M1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return a_grid_desc_k0_m0_m1_k1; + } + + __host__ __device__ static constexpr auto + MakeBGridDescriptor_K0_N0_N1_K1(const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1) + { + const auto K0 = b_grid_desc_k0_n_k1.GetLength(I0); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + + const auto N1 = Number{}; + const auto N0 = N / N1; + + const auto b_grid_desc_k0_n0_n1_k1 = + transform_tensor_descriptor(b_grid_desc_k0_n_k1, + make_tuple(make_pass_through_transform(K0), + make_unmerge_transform(make_tuple(N0, N1)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + 
make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + return b_grid_desc_k0_n0_n1_k1; + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + constexpr auto M11 = + Number{}; + constexpr auto N11 = + Number{}; + + constexpr auto M10 = M1 / M11; + constexpr auto N10 = N1 / N11; + + const auto c_grid_desc_m0_m10_m11_n0_n10_n11 = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), + make_unmerge_transform(make_tuple(N0, N10, N11))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_grid_desc_m0_m10_m11_n0_n10_n11; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N00_M01_N01( + c_grid_desc_m_n); + } + + using AGridDesc_K0_M0_M1_K1 = decltype(MakeAGridDescriptor_K0_M0_M1_K1(AGridDesc_K0_M_K1{})); + using BGridDesc_K0_N0_N1_K1 = decltype(MakeBGridDescriptor_K0_N0_N1_K1(BGridDesc_K0_N_K1{})); + using CGridDesc_M0_M10_M11_N0_N10_N11 = + decltype(MakeCGridDescriptor_M0_M10_M11_N0_N10_N11(CGridDesc_M_N{})); + using Block2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_K0_M0_M1_K1& a_grid_desc_k0_m0_m1_k1, + const BGridDesc_K0_N0_N1_K1& b_grid_desc_k0_n0_n1_k1, + const CGridDesc_M0_M10_M11_N0_N10_N11& c_grid_desc_m0_m10_m11_n0_n10_n11, + const Block2CTileMap& block_2_ctile_map, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m0_m1_k1.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n0_n1_k1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_m0_m10_m11_n0_n10_n11.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto c_m0_n0_block_cluster_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); + const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); + + if(!block_2_ctile_map.ValidCTileIndex( + make_tuple(im0, in0), + make_tuple(c_grid_desc_m0_m10_m11_n0_n10_n11.GetLength(I0), + c_grid_desc_m0_m10_m11_n0_n10_n11.GetLength(I3)))) + { + return; + } + + // TODO: change this. 
I think it needs multi-dimensional alignment + constexpr auto max_lds_align = K1; + + // TODO: check alignment + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_block_desc_k0_n0_n1_k1 = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}, K1), max_lds_align); + + // TODO: check alignment + // A matrix in LDS memory, for blockwise GEMM + constexpr auto a_k0_m_k1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + // TODO: check alignment + // B matrix in LDS memory, for blockwise GEMM + constexpr auto b_k0_n_k1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + + static_assert(a_block_desc_k0_m0_m1_k1.GetElementSpaceSize() == + a_k0_m_k1_block_desc.GetElementSpaceSize() && + b_block_desc_k0_n0_n1_k1.GetElementSpaceSize() == + b_k0_n_k1_block_desc.GetElementSpaceSize() && + "wrong!"); + + // A matrix blockwise copy + auto a_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence, + ABlockTransferThreadSliceLengths_K0_M0_M1_K1, + ABlockTransferThreadClusterLengths_K0_M0_M1_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + remove_reference_t, + decltype(a_block_desc_k0_m0_m1_k1), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3>, + ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1, // SrcVectorTensorLengths + ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1, // DstVectorTensorLengths + ABlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder + false, + true>(a_grid_desc_k0_m0_m1_k1, + make_multi_index(0, im0, 0, 0), + a_block_desc_k0_m0_m1_k1, + make_multi_index(0, 0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = BlockwiseTensorSliceTransfer_v5r1< + BlockSize, + InMemoryDataOperationEnum::Set, + Sequence, + BBlockTransferThreadSliceLengths_K0_N0_N1_K1, + BBlockTransferThreadClusterLengths_K0_N0_N1_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + remove_reference_t, + decltype(b_block_desc_k0_n0_n1_k1), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3>, + BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1, // SrcVectorTensorLengths + BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1, // DstVectorTensorLengths + BBlockTransferSrcVectorTensorContiguousDimOrder, // SrcVectorTensorContiguousDimOrder + Sequence<0, 1, 2, 3>, // DstVectorTensorContiguousDimOrder + false, + true>(b_grid_desc_k0_n0_n1_k1, + make_multi_index(0, in0, 0, 0), + b_block_desc_k0_n0_n1_k1, + make_multi_index(0, 0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[KPerBlocl, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + const auto blockwise_gemm = + BlockwiseGemmDl_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_BM0_2_BN0_2< + BlockSize, + FloatAB, + FloatAB, + FloatAcc, + decltype(a_k0_m_k1_block_desc), + decltype(b_k0_n_k1_block_desc), + M1PerThreadM111, + N1PerThreadN111, + KPerThread, + M11N11ThreadClusterM110Xs, + 
M11N11ThreadClusterN110Xs, + M1PerThreadM111, + N1PerThreadN111>{}; + + constexpr auto c_m10_m11_n10_n11_thread_tensor_lengths = + decltype(blockwise_gemm)::GetCThreadTensorLengths_BM0_BM1_BN0_BN1(); + + constexpr auto c_thread_desc_m10_m11_n10_n11 = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = math::integer_least_multiple( + a_block_desc_k0_m0_m1_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = math::integer_least_multiple( + b_block_desc_k0_n0_n1_k1.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_thread_desc_m10_m11_n10_n11.GetElementSpaceSize()); + + // Initialize C + c_thread_buf.Clear(); + + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0, 0); + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_block_desc_k0_m0_m1_k1.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_block_desc_k0_n0_n1_k1.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_block_desc_k0_m0_m1_k1.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_block_desc_k0_n0_n1_k1.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + const auto K0 = a_grid_desc_k0_m0_m1_k1.GetLength(I0); + + index_t k_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m0_m1_k1, + a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n0_n1_k1, + b_block_slice_copy_step); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_thread_desc_m10_m11_n10_n11, + a_block_even_buf, + b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m0_m1_k1, + a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n0_n1_k1, + b_block_slice_copy_step); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + 
c_thread_desc_m10_m11_n10_n11, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_even_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_even_buf); + + k_block_data_begin += 2 * K0PerBlock; + } while(k_block_data_begin < K0 - 2 * K0PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m0_m1_k1, a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_n0_n1_k1, b_block_slice_copy_step); + + block_sync_lds(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead(a_grid_desc_k0_m0_m1_k1, a_global_buf); + b_blockwise_copy.RunRead(b_grid_desc_k0_n0_n1_k1, b_global_buf); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_thread_desc_m10_m11_n10_n11, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_block_desc_k0_m0_m1_k1, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_block_desc_k0_n0_n1_k1, b_block_odd_buf); + + block_sync_lds(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_m10_m11_n10_n11, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_thread_desc_m10_m11_n10_n11, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_thread_desc_m0_m10_m11_n0_n10_n11 = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = + blockwise_gemm.CalculateCThreadOriginOnBlock_BM0_BM1_BN0_BN1( + get_thread_local_1d_id()); + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_thread_desc_m0_m10_m11_n0_n10_n11), + decltype(c_grid_desc_m0_m10_m11_n0_n10_n11), + ck::tensor_operation::element_wise::PassThrough, + Sequence<1, + c_m10_m11_n10_n11_thread_tensor_lengths[I0], + c_m10_m11_n10_n11_thread_tensor_lengths[I1], + 1, + c_m10_m11_n10_n11_thread_tensor_lengths[I2], + c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{c_grid_desc_m0_m10_m11_n0_n10_n11, + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3]), + ck::tensor_operation::element_wise::PassThrough{}} + .Run(c_thread_desc_m0_m10_m11_n0_n10_n11, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_grid_desc_m0_m10_m11_n0_n10_n11, + c_grid_buf); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp new file mode 100644 index 00000000..84e033e1 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v1r2.hpp @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
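+// [Illustrative note added for readability; the numbers below are hypothetical.]
+// Work partitioning in this dlops v1r2 gridwise GEMM can be traced with a small
+// example: for M = 1024, N = 512, MPerBlockM1 = 128 and NPerBlockN1 = 128,
+// CalculateGridSize returns (1024 / 128) * (512 / 128) = 8 * 4 = 32 workgroups,
+// and MakeCBlockIdToM0N0BlockClusterAdaptor merges (M0, N0) = (8, 4) into a single
+// block id; assuming the usual row-major merge order, block id 13 maps back to
+// the C tile (m0, n0) = (13 / 4, 13 % 4) = (3, 1).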
+ +#ifndef CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP +#define CK_GRIDWISE_GEMM_DLOPS_V1R2_HPP + +#include "common_header.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_gemm_dlops_v2r2.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v1r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AKM0M1GridDesc a_k_m0_m1_grid_desc, + const BKN0N1GridDesc b_k_n0_n1_grid_desc, + const CM0M10M11N0N10N11GridDesc c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor cblockid_to_m0_n0_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_k_m0_m1_grid_desc, + b_k_n0_n1_grid_desc, + c_m0_m10_m11_n0_n10_n11_grid_desc, + cblockid_to_m0_n0_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +struct GridwiseGemmDlops_km_kn_mn_v1r2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = math::lcm(Number{}, + Number{}, + Number{}, + Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_k_m_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + math::integer_least_multiple(b_k_n_block_desc.GetElementSpaceSize(), max_lds_align); + + return 2 * (a_block_aligned_space_size + b_block_aligned_space_size) * sizeof(FloatAB); + } + + __host__ __device__ static constexpr bool CheckValidity(const AKMGridDesc& a_k_m_grid_desc, + const BKNGridDesc& b_k_n_grid_desc, + const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = a_k_m_grid_desc.GetLength(I1); + const auto N = b_k_n_grid_desc.GetLength(I1); + const auto K = a_k_m_grid_desc.GetLength(I0); + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + + return (M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K == b_k_n_grid_desc.GetLength(I0)) && + (M % MPerBlockM1 == 0 && N % NPerBlockN1 == 0 && K % KPerBlock == 0); + } + + __host__ __device__ static constexpr index_t CalculateGridSize(index_t M, index_t N) + { + const index_t grid_size = (M / MPerBlockM1) * (N / NPerBlockN1); + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const bool has_main_k_block_loop = (K + KPerBlock) / (2 * 
KPerBlock) > 1; + + return has_main_k_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailKBlockLoop(index_t K) + { + const bool has_double_tail_k_block_loop = (K / KPerBlock) % 2 == 0; + + return has_double_tail_k_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAKM0M1GridDescriptor(const AKMGridDesc& a_k_m_grid_desc) + { + const auto K = a_k_m_grid_desc.GetLength(I0); + const auto M = a_k_m_grid_desc.GetLength(I1); + + const auto M1 = Number{}; + const auto M0 = M / M1; + + const auto a_k_m0_m1_grid_desc = transform_tensor_descriptor( + a_k_m_grid_desc, + make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(M0, M1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return a_k_m0_m1_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeBKN0N1GridDescriptor(const BKNGridDesc& b_k_n_grid_desc) + { + const auto K = b_k_n_grid_desc.GetLength(I0); + const auto N = b_k_n_grid_desc.GetLength(I1); + + const auto N1 = Number{}; + const auto N0 = N / N1; + + const auto b_k_n0_n1_grid_desc = transform_tensor_descriptor( + b_k_n_grid_desc, + make_tuple(make_pass_through_transform(K), make_unmerge_transform(make_tuple(N0, N1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{})); + + return b_k_n0_n1_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCM0M10M11N0N10N11GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + constexpr auto M11 = + Number{}; + constexpr auto N11 = + Number{}; + + constexpr auto M10 = M1 / M11; + constexpr auto N10 = N1 / N11; + + const auto c_m0_m10_m11_n0_n10_n11_grid_desc = transform_tensor_descriptor( + c_m_n_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(M0, M10, M11)), + make_unmerge_transform(make_tuple(N0, N10, N11))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_m0_m10_m11_n0_n10_n11_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCBlockIdToM0N0BlockClusterAdaptor(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + const auto cblockid_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor(make_tuple(make_merge_transform(make_tuple(M0, N0))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return cblockid_to_m0_n0_block_cluster_adaptor; + } + + using AKM0M1GridDesc = decltype(MakeAKM0M1GridDescriptor(AKMGridDesc{})); + using BKN0N1GridDesc = decltype(MakeBKN0N1GridDescriptor(BKNGridDesc{})); + using CM0M10M11N0N10N11GridDesc = decltype(MakeCM0M10M11N0N10N11GridDescriptor(CMNGridDesc{})); + using CBlockIdToM0N0BlockClusterAdaptor = + decltype(MakeCBlockIdToM0N0BlockClusterAdaptor(CMNGridDesc{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AKM0M1GridDesc& a_k_m0_m1_grid_desc, + const BKN0N1GridDesc& b_k_n0_n1_grid_desc, + const CM0M10M11N0N10N11GridDesc& 
c_m0_m10_m11_n0_n10_n11_grid_desc, + const CBlockIdToM0N0BlockClusterAdaptor& cblockid_to_m0_n0_block_cluster_adaptor, + integral_constant, + integral_constant) + { + const auto a_global_buf = make_dynamic_buffer( + p_a_grid, a_k_m0_m1_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_grid, b_k_n0_n1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_m0_m10_m11_n0_n10_n11_grid_desc.GetElementSpaceSize()); + + const auto K = a_k_m0_m1_grid_desc.GetLength(I0); + + // divide block work by [M, N] + const auto c_m0_n0_block_cluster_idx = + cblockid_to_m0_n0_block_cluster_adaptor.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + + // HACK: this force index data into SGPR + const index_t im0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I0]); + const index_t in0 = __builtin_amdgcn_readfirstlane(c_m0_n0_block_cluster_idx[I1]); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(Number{}, + Number{}, + Number{}, + Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_k_m0_m1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_k_n0_n1_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, I1, Number{}), max_lds_align); + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_K_M0_M1, + ABlockTransferThreadClusterLengths_K_M0_M1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_k_m0_m1_grid_desc), + decltype(a_k_m0_m1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_M1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_k_m0_m1_grid_desc, + make_multi_index(0, im0, 0), + a_k_m0_m1_block_desc, + make_multi_index(0, 0, 0)); + + // B matrix blockwise copy + auto b_blockwise_copy = + BlockwiseTensorSliceTransfer_v4, + BBlockTransferThreadSliceLengths_K_N0_N1, + BBlockTransferThreadClusterLengths_K_N0_N1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_k_n0_n1_grid_desc), + decltype(b_k_n0_n1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 1, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_N1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_k_n0_n1_grid_desc, + make_multi_index(0, in0, 0), + b_k_n0_n1_block_desc, + make_multi_index(0, 0, 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[KPerBlock, MPerBlockM1] is in LDS + // b_mtx[KPerBlocl, NPerBlockN1] is in LDS + // c_mtx[MPerBlockM1, NPerBlockN1] is distributed among threads, and saved in + // register + const auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2{}; + constexpr auto 
c_m10_m11_n10_n11_thread_tensor_lengths = + decltype(blockwise_gemm)::GetCM0M1N0N1ThreadTensorLengths(); + + constexpr auto c_m10_m11_n10_n11_thread_desc = make_naive_tensor_descriptor_packed( + sequence_to_tuple_of_number(c_m10_m11_n10_n11_thread_tensor_lengths)); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_aligned_space_size = + math::integer_least_multiple(a_k_m0_m1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_aligned_space_size = + math::integer_least_multiple(b_k_n0_n1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block_double = p_shared_block; + FloatAB* p_b_block_double = p_shared_block + 2 * a_block_aligned_space_size; + + // register allocation for output + auto c_thread_buf = make_static_buffer( + c_m10_m11_n10_n11_thread_desc.GetElementSpaceSize()); + + ThreadwiseTensorSliceSet_v1{} + .Run(c_m10_m11_n10_n11_thread_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0, 0); + + // hack to control index calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_k_m0_m1_global_step_hacks = AGridStepHacks{}; + constexpr auto b_k_n0_n1_global_step_hacks = BGridStepHacks{}; + + // hack to control index calculation when move slice window for A and B matrix for + // threadwise copy + constexpr auto a_k_m0_m1_global_move_slice_window_step_hack = + AGridMoveSliceWindowStepHacks{}; + constexpr auto b_k_n0_n1_global_move_slice_window_step_hack = + BGridMoveSliceWindowStepHacks{}; + + auto a_block_even_buf = make_dynamic_buffer( + p_a_block_double, a_k_m0_m1_block_desc.GetElementSpaceSize()); + auto b_block_even_buf = make_dynamic_buffer( + p_b_block_double, b_k_n0_n1_block_desc.GetElementSpaceSize()); + + auto a_block_odd_buf = make_dynamic_buffer( + p_a_block_double + a_block_aligned_space_size, + a_k_m0_m1_block_desc.GetElementSpaceSize()); + auto b_block_odd_buf = make_dynamic_buffer( + p_b_block_double + b_block_aligned_space_size, + b_k_n0_n1_block_desc.GetElementSpaceSize()); + + // LDS double buffer: preload data into LDS + { + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks); + + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf); + } + + if constexpr(HasMainKBlockLoop) + { + index_t k_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc, + a_block_slice_copy_step, + a_k_m0_m1_global_move_slice_window_step_hack); + b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc, + b_block_slice_copy_step, + b_k_n0_n1_global_move_slice_window_step_hack); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(c_m10_m11_n10_n11_thread_desc, + a_block_even_buf, + b_block_even_buf, + c_thread_buf); + + // LDS double buffer: store next data to LDS + 
a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf); + + // odd iteration + a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc, + a_block_slice_copy_step, + a_k_m0_m1_global_move_slice_window_step_hack); + b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc, + b_block_slice_copy_step, + b_k_n0_n1_global_move_slice_window_step_hack); + + __syncthreads(); + + // LDS doubel buffer: load next data from device mem + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + + // LDS double buffer: store next data to LDS + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_even_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_even_buf); + + k_block_data_begin += 2 * KPerBlock; + } while(k_block_data_begin < K - 2 * KPerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + a_blockwise_copy.MoveSrcSliceWindow(a_k_m0_m1_grid_desc, + a_block_slice_copy_step, + a_k_m0_m1_global_move_slice_window_step_hack); + b_blockwise_copy.MoveSrcSliceWindow(b_k_n0_n1_grid_desc, + b_block_slice_copy_step, + b_k_n0_n1_global_move_slice_window_step_hack); + + __syncthreads(); + + // LDS double buffer: load last data from device mem + a_blockwise_copy.RunRead( + a_k_m0_m1_grid_desc, a_global_buf, a_k_m0_m1_global_step_hacks); + b_blockwise_copy.RunRead( + b_k_n0_n1_grid_desc, b_global_buf, b_k_n0_n1_global_step_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); + + // LDS double buffer: store last data to LDS + a_blockwise_copy.RunWrite(a_k_m0_m1_block_desc, a_block_odd_buf); + b_blockwise_copy.RunWrite(b_k_n0_n1_block_desc, b_block_odd_buf); + + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_odd_buf, b_block_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + __syncthreads(); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run( + c_m10_m11_n10_n11_thread_desc, a_block_even_buf, b_block_even_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_m0_m10_m11_n0_n10_n11_thread_desc = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + const auto c_m10_m11_n10_n11_thread_origin_idx_on_block = + blockwise_gemm.CalculateCM0M1N0N1ThreadOriginOnBlock(get_thread_local_1d_id()); + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_m0_m10_m11_n0_n10_n11_thread_desc), + decltype(c_m0_m10_m11_n0_n10_n11_grid_desc), + Sequence<1, + c_m10_m11_n10_n11_thread_tensor_lengths[I0], + c_m10_m11_n10_n11_thread_tensor_lengths[I1], + 1, + c_m10_m11_n10_n11_thread_tensor_lengths[I2], + c_m10_m11_n10_n11_thread_tensor_lengths[I3]>, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{c_m0_m10_m11_n0_n10_n11_grid_desc, + make_multi_index(im0, + c_m10_m11_n10_n11_thread_origin_idx_on_block[I0], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I1], + in0, + 
c_m10_m11_n10_n11_thread_origin_idx_on_block[I2], + c_m10_m11_n10_n11_thread_origin_idx_on_block[I3])} + .Run(c_m0_m10_m11_n0_n10_n11_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_m0_m10_m11_n0_n10_n11_grid_desc, + c_grid_buf, + CGridStepHacks{}); + } + } +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp new file mode 100644 index 00000000..b1dfb0c7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v2.hpp @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_GRIDWISE_GEMM_V2_HPP +#define CK_GRIDWISE_GEMM_V2_HPP + +#include "common_header.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "blockwise_gemm_dlops_v3.hpp" + +namespace ck { + +template +struct GridwiseGemmDlops_km_kn_mn_v3 +{ + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto E = EPerBlock * 3 * 3; + + constexpr auto max_lds_align = + math::lcm(Number{}, Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_e_k_desc.GetElementSpaceSize(), max_lds_align); + + return a_block_space_size * sizeof(FloatAB); + } + + template + __device__ void Run(const AGlobalDesc& a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const BGlobalDesc& b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const CGlobalDesc& c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + FloatAB* __restrict__ p_shared_block, + integral_constant, + integral_constant) const + { + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e_k_global_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e_n_ho_wo_global_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k_n_ho_wo_global_desc.GetElementSpaceSize()); + + constexpr auto E = EPerBlock * 3 * 3; + + // const auto E = a_e_k_global_desc.GetLength(I0); + const auto K = a_e_k_global_desc.GetLength(I1); + + const auto N = b_e_n_ho_wo_global_desc.GetLength(I1); + const auto Ho = b_e_n_ho_wo_global_desc.GetLength(I2); + const auto Wo = b_e_n_ho_wo_global_desc.GetLength(I3); + +// divide block work by [M, N] +#if 0 + const auto ho_block_work_num = Ho / Number{}; + const auto wo_block_work_num = Wo / Number{}; + const auto hwo_block_work_num = ho_block_work_num * wo_block_work_num; + + const index_t k_block_work_id = get_block_1d_id() / hwo_block_work_num; + const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num; + + const index_t ho_block_work_id = hwo_block_work_id / wo_block_work_num; + const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num; +#else + // Hack: this force result into SGPR + const index_t 
ho_block_work_num = __builtin_amdgcn_readfirstlane(Ho / HoPerBlock); + const index_t wo_block_work_num = __builtin_amdgcn_readfirstlane(Wo / WoPerBlock); + const index_t hwo_block_work_num = ho_block_work_num * wo_block_work_num; + + const index_t k_block_work_id = + __builtin_amdgcn_readfirstlane(get_block_1d_id() / hwo_block_work_num); + const index_t hwo_block_work_id = get_block_1d_id() - k_block_work_id * hwo_block_work_num; + + const index_t ho_block_work_id = + __builtin_amdgcn_readfirstlane(hwo_block_work_id / wo_block_work_num); + const index_t wo_block_work_id = hwo_block_work_id - ho_block_work_id * wo_block_work_num; +#endif + + // lds max alignment + constexpr auto max_lds_align = + math::lcm(Number{}, Number{}); + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_e_k_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + constexpr auto a_e_k_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}), max_lds_align); + + // B matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto b_e_n_ho_wo_block_desc = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + // c_thread_mtx definition: this is a mess + // TODO:: more elegent way of defining c_thread_mtx + constexpr auto c_k_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + + auto c_thread_mtx_index = blockwise_gemm.GetBeginOfThreadMatrixC(get_thread_local_1d_id()); + + const auto k_thread_id = c_thread_mtx_index.k; + const auto ho_thread_id = c_thread_mtx_index.h; + const auto wo_thread_id = c_thread_mtx_index.w; + + const index_t k_block_data_on_global = k_block_work_id * KPerBlock; + const index_t ho_block_data_on_global = ho_block_work_id * HoPerBlock; + const index_t wo_block_data_on_global = wo_block_work_id * WoPerBlock; + + const index_t ho_thread_data_on_global = + ho_block_data_on_global + ho_thread_id * HoPerThread; + const index_t wo_thread_data_on_global = + wo_block_data_on_global + wo_thread_id * WoPerThread; + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_E_K, + ABlockTransferThreadClusterLengths_E_K, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_e_k_global_desc), + decltype(a_e_k_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1>, + ABlockTransferSrcVectorDim, + 1, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>(a_e_k_global_desc, + make_multi_index(0, k_block_data_on_global), + a_e_k_desc, + make_multi_index(0, 0)); + + constexpr auto b_e_n_ho_wo_thread_desc = make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number<1>{}, Number{}, Number{})); + + auto b_threadwise_transfer = + ThreadwiseTensorSliceTransfer_v2, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + 1, + true>( + b_e_n_ho_wo_global_desc, + make_multi_index(0, 0, ho_thread_data_on_global, wo_thread_data_on_global)); + + auto a_block_buf = make_dynamic_buffer( + p_shared_block, a_e_k_desc.GetElementSpaceSize()); + + // register allocation for output + StaticBuffer + c_thread_buf; + + // initialize output thread tensor + ThreadwiseTensorSliceSet_v1>{} + 
.Run(c_k_n_ho_wo_thread_desc, make_tuple(I0, I0, I0, I0), c_thread_buf, FloatAcc{0}); + + constexpr auto b_thread_slice_copy_step = make_multi_index(EPerBlock, 0, 0, 0); + + // hack to control index calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_e_k_global_step_hacks = AGlobalStepHacks{}; + constexpr auto b_e_n_ho_wo_global_step_hacks = BGlobalStepHacks{}; + + // hack to control index calculation when move slice window for A and B matrix for + // threadwise copy + constexpr auto a_e_k_global_move_slice_window_step_hack = AGlobalMoveSliceWindowStepHacks{}; + constexpr auto b_e_n_ho_wo_global_move_slice_window_step_hack = + BGlobalMoveSliceWindowStepHacks{}; + + // double regsiter buffer for b + StaticBuffer + b_thread_even_buf, b_thread_odd_buf; + + // LDS double buffer: preload data + { + a_blockwise_copy.RunRead(a_e_k_global_desc, a_global_buf, a_e_k_global_step_hacks); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_even_buf, + b_e_n_ho_wo_global_step_hacks); + + a_blockwise_copy.RunWrite(a_e_k_desc, a_block_buf); + } + + __syncthreads(); + + if constexpr(HasMainKBlockLoop) + { + index_t e_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc, + b_thread_slice_copy_step); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_odd_buf, + b_e_n_ho_wo_global_step_hacks); + + // LDS double buffer: GEMM on current data + // TODO: @Zhang Jing: blockwise gemm should be able to move slice window + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc, + b_thread_slice_copy_step); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_even_buf, + b_e_n_ho_wo_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + + blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0)); + + e_block_data_begin += 2 * EPerBlock; + + } while(e_block_data_begin < E - 2 * EPerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailKBlockLoop) // if has 2 iteration left + { + b_threadwise_transfer.MoveSrcSliceWindow(b_e_n_ho_wo_global_desc, + b_thread_slice_copy_step); + + b_threadwise_transfer.Run(b_e_n_ho_wo_global_desc, + b_global_buf, + b_e_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + b_thread_odd_buf, + b_e_n_ho_wo_global_step_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveASliceWindow(a_e_k_block_desc, make_tuple(EPerBlock, 0)); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + } + + // output: register to global memory + { + // hack to control index calculation when iterating over c_k_n_ho_wo_global tensor + constexpr auto c_k_n_ho_wo_global_tensor_step_hacks = 
CGlobalStepHacks{}; + + const index_t k_thread_data_on_global = + k_block_data_on_global + k_thread_id * KPerThread; + + ThreadwiseTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>( + c_k_n_ho_wo_global_desc, + make_multi_index( + k_thread_data_on_global, 0, ho_thread_data_on_global, wo_thread_data_on_global)) + .Run(c_k_n_ho_wo_thread_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + c_k_n_ho_wo_global_desc, + c_global_buf, + c_k_n_ho_wo_global_tensor_step_hacks); + } + } + + // pass tensor descriptor by reference + template + __device__ void Run(const AGlobalDesc& a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const BGlobalDesc& b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const CGlobalDesc& c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + integral_constant, + integral_constant) const + { + constexpr index_t shared_block_size = GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + Run(a_e_k_global_desc, + p_a_global, + b_e_n_ho_wo_global_desc, + p_b_global, + c_k_n_ho_wo_global_desc, + p_c_global, + p_shared_block, + integral_constant{}, + integral_constant{}); + } + + // pass tensor descriptors by their pointers + template + __device__ void Run(const AGlobalDesc* p_a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const BGlobalDesc* p_b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const CGlobalDesc* p_c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + integral_constant, + integral_constant) const + { + const auto a_e_k_global_desc = *p_a_e_k_global_desc; + const auto b_e_n_ho_wo_global_desc = *p_b_e_n_ho_wo_global_desc; + const auto c_k_n_ho_wo_global_desc = *p_c_k_n_ho_wo_global_desc; + + Run(a_e_k_global_desc, + p_a_global, + b_e_n_ho_wo_global_desc, + p_b_global, + c_k_n_ho_wo_global_desc, + p_c_global, + integral_constant{}, + integral_constant{}); + } + + // pass tensor descriptors by void* + template + __device__ void Run(const void* p_a_e_k_global_desc, + const FloatAB* __restrict__ p_a_global, + const void* p_b_e_n_ho_wo_global_desc, + const FloatAB* __restrict__ p_b_global, + const void* p_c_k_n_ho_wo_global_desc, + FloatC* __restrict__ p_c_global, + integral_constant, + integral_constant) const + { + const auto a_e_k_global_desc = *reinterpret_cast(p_a_e_k_global_desc); + const auto b_e_n_ho_wo_global_desc = + *reinterpret_cast(p_b_e_n_ho_wo_global_desc); + const auto c_k_n_ho_wo_global_desc = + *reinterpret_cast(p_c_k_n_ho_wo_global_desc); + + Run(a_e_k_global_desc, + p_a_global, + b_e_n_ho_wo_global_desc, + p_b_global, + c_k_n_ho_wo_global_desc, + p_c_global, + integral_constant{}, + integral_constant{}); + } +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp new file mode 100644 index 00000000..ace84433 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_dlops_v3.hpp @@ -0,0 +1,1597 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
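+// Added descriptive note (summary of the code below): this header implements a gridwise
+// DLOPS GEMM used as a fused convolution pipeline — implicit-GEMM conv + bias + activation,
+// with optional 2x2 max-pool (kernel_gemm_dlops_v3_maxpool) or 2x upsample-and-accumulate
+// (kernel_gemm_dlops_v3_resize_add) output stages. The A tile is staged through LDS while
+// the B tile is double-buffered in thread registers.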
+ +#ifndef CK_GRIDWISE_GEMM_V3_HPP +#define CK_GRIDWISE_GEMM_V3_HPP + +#include "common_header.hpp" +#include "multi_index_transform_helper.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "blockwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_transfer.hpp" +#include "threadwise_tensor_slice_set.hpp" +#include "blockwise_gemm_dlops_v3.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W cblockid_to_k_n_h_w_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActiv(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_resize_add( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_d_grid, + const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W cblockid_to_k_n_h_w_block_cluster_adaptor) +{ + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActivResizeAdd(p_a_grid, + p_b_grid, + p_bias_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_dlops_v3_maxpool( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + const FloatC* __restrict__ p_bias_grid, + FloatC* __restrict__ p_c_grid, + FloatC* __restrict__ p_d_grid, + const AGridDesc_E0_E1_K0_K1_E2 a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W cblockid_to_k_n_h_w_block_cluster_adaptor) +{ + constexpr index_t 
shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::ConvBiasActivMaxpool(p_a_grid, + p_b_grid, + p_bias_grid, + p_c_grid, + p_d_grid, + p_shared_block, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant{}, + integral_constant{}); +} + +template +struct GridwiseGemmDlops_km_kn_mn_v3 +{ + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + static constexpr auto E1 = Number{}; + static constexpr auto E2 = Number{}; + static constexpr auto K2 = Number{}; + + static constexpr auto NPerBlock = I1; + + static constexpr FloatAcc alpha = 0.3; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = Number{}; + + // A matrix in LDS memory, dst of blockwise copy + // be careful of LDS alignment + constexpr auto a_e0_e1_k1_e2_block_desc = make_naive_tensor_descriptor_aligned( + make_tuple(I1, Number{}, Number{}, Number{}), max_lds_align); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = math::integer_least_multiple( + a_e0_e1_k1_e2_block_desc.GetElementSpaceSize(), max_lds_align); + + return a_block_space_size * sizeof(FloatAB); + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc) + { + const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0); + const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1); + const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2); + const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3); + + const auto K0 = K / KPerBlock; + const auto N0 = N / NPerBlock; + const auto H0 = Ho / HoPerBlock; + const auto W0 = Wo / WoPerBlock; + + const index_t grid_size = K0 * N0 * H0 * W0; + + return grid_size; + } + + __host__ __device__ static constexpr bool CalculateHasMainE0BlockLoop(const index_t E0) + { + const bool has_main_e0_block_loop = E0 > 1; + + return has_main_e0_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasMainE1BlockLoop() + { + const bool has_main_e1_block_loop = ((E1 + E1PerBlock) / (2 * E1PerBlock)) > 1; + + return has_main_e1_block_loop; + } + + __host__ __device__ static constexpr bool CalculateHasDoubleTailE1BlockLoop() + { + const bool has_double_tail_e1_block_loop = (E1 / E1PerBlock) % 2 == 0; + + return has_double_tail_e1_block_loop; + } + + __host__ __device__ static constexpr auto + MakeAE0E1K0K1E2GridDescriptor(const AGridDesc_E0_E1_K_E2& a_e0_e1_k_e2_grid_desc) + { + const auto E0 = a_e0_e1_k_e2_grid_desc.GetLength(I0); + const auto K = a_e0_e1_k_e2_grid_desc.GetLength(I2); + + const auto K1 = Number{}; + const auto K0 = K / K1; + + const auto a_e0_e1_k0_k1_e2_grid_desc = transform_tensor_descriptor( + a_e0_e1_k_e2_grid_desc, + make_tuple(make_pass_through_transform(E0), + make_pass_through_transform(E1), + make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{})); + + return a_e0_e1_k0_k1_e2_grid_desc; + } + + __host__ __device__ 
static constexpr auto MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor( + const BGridDesc_E0_E1_N_Ho_Wo_E2& b_e0_e1_n_ho_wo_e2_grid_desc) + { + const auto E0 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I0); + // const auto E1 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I1); + const auto N = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I2); + const auto Ho = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I3); + const auto Wo = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I4); + // const auto E2 = b_e0_e1_n_ho_wo_e2_grid_desc.GetLength(I5); + + const auto H2 = Number{}; + const auto H1 = Number{}; + const auto H0 = Ho / (H1 * H2); + + const auto W2 = Number{}; + const auto W1 = Number{}; + const auto W0 = Wo / (W1 * W2); + + const auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc = + transform_tensor_descriptor(b_e0_e1_n_ho_wo_e2_grid_desc, + make_tuple(make_pass_through_transform(E0), + make_pass_through_transform(E1), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2)), + make_pass_through_transform(E2)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3, 4, 5>{}, + Sequence<6, 7, 8>{}, + Sequence<9>{})); + + return b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCK0K1NH0H1H2W0W1W2GridDescriptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc) + { + const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0); + const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1); + const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2); + const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3); + + const auto K1 = Number{}; + const auto K0 = K / K1; + + const auto H2 = Number{}; + const auto H1 = Number{}; + const auto H0 = Ho / (H1 * H2); + + const auto W2 = Number{}; + const auto W1 = Number{}; + const auto W0 = Wo / (W1 * W2); + + const auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc = transform_tensor_descriptor( + c_k_n_ho_wo_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{})); + + return c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeDK0K1NH0H1HxW0W1WxGridDescriptorMaxPool(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc) + { + const auto K = d_k_n_hx_wx_grid_desc.GetLength(I0); + const auto N = d_k_n_hx_wx_grid_desc.GetLength(I1); + const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2); + const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3); + + const auto K1 = Number{}; + const auto K0 = K / K1; + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto H2 = Number{}; + const auto H1 = Number{}; + const auto H0 = Number{}; + + const auto W2 = Number{}; + const auto W1 = Number{}; + const auto W0 = Number{}; +#else + const auto H2 = HoPerThread / 2; + const auto H1 = HoPerBlock / HoPerThread; + const auto H0 = Hx / (H1 * H2); + + const auto W2 = WoPerThread / 2; + const auto W1 = WoPerBlock / WoPerThread; + const auto W0 = Wx / (W1 * W2); +#endif + + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor( + d_k_n_hx_wx_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + 
make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{})); + + return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeDK0K1NH0H1HxW0W1WxGridDescriptorResizeAdd(const DGridDesc_K_N_Hx_Wx& d_k_n_hx_wx_grid_desc) + { + const auto K = d_k_n_hx_wx_grid_desc.GetLength(I0); + const auto N = d_k_n_hx_wx_grid_desc.GetLength(I1); + const auto Hx = d_k_n_hx_wx_grid_desc.GetLength(I2); + const auto Wx = d_k_n_hx_wx_grid_desc.GetLength(I3); + + const auto K1 = Number{}; + const auto K0 = K / K1; + + const auto H2 = Number{}; + const auto H1 = Number{}; + + const auto W2 = Number{}; + const auto W1 = Number{}; + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto H0 = Number{}; + const auto W0 = Number{}; +#else + const auto H0 = Hx / (H1 * H2); + const auto W0 = Wx / (W1 * W2); +#endif + + const auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc = transform_tensor_descriptor( + d_k_n_hx_wx_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(K0, K1)), + make_pass_through_transform(N), + make_unmerge_transform(make_tuple(H0, H1, H2)), + make_unmerge_transform(make_tuple(W0, W1, W2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}, Sequence<6, 7, 8>{})); + + return d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc; + } + + __host__ __device__ static constexpr auto + MakeCBlockIdToKNHoWoBlockClusterAdaptor(const CGridDesc_K_N_Ho_Wo& c_k_n_ho_wo_grid_desc) + { + const auto K = c_k_n_ho_wo_grid_desc.GetLength(I0); + const auto N = c_k_n_ho_wo_grid_desc.GetLength(I1); + const auto Ho = c_k_n_ho_wo_grid_desc.GetLength(I2); + const auto Wo = c_k_n_ho_wo_grid_desc.GetLength(I3); + +#if CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR + const auto K0 = Number{}; + const auto N0 = Number{}; + const auto H0 = Number{}; + const auto W0 = Number{}; +#else + const auto K0 = K / KPerBlock; + const auto N0 = N / NPerBlock; + const auto H0 = Ho / HoPerBlock; + const auto W0 = Wo / WoPerBlock; +#endif + + const auto cblockid_to_k_n_ho_wo_block_cluster_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(K0, N0, H0, W0))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + return cblockid_to_k_n_ho_wo_block_cluster_adaptor; + } + + // using AGridDesc_E0_E1_K0_K1_E2 = + // decltype(MakeAE0E1K0K1E2GridDescriptor(AGridDesc_E0_E1_K_E2{})); + // using BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2 = + // decltype(MakeBE0E1NH0H1H2W0W1W2E2GridDescriptor(BGridDesc_E0_E1_N_Ho_Wo_E2{})); + // using CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2 = + // decltype(MakeCK0K1NH0H1H2W0W1W2GridDescriptor(CGridDesc_K_N_Ho_Wo{})); + // using DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx = + // decltype(MakeDK0K1NH0H1HxW0W1WxGridDescriptor(DGridDesc_K_N_Hx_Wx{})); + + using CBlockIdToBlockClusterAdaptor_K_N_H_W = + decltype(MakeCBlockIdToKNHoWoBlockClusterAdaptor(CGridDesc_K_N_Ho_Wo{})); + + template + __host__ __device__ static constexpr auto MakeBiasK0K1GridDescriptor( + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc) + { + const auto K0 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I0); + const auto K1 = c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetLength(I1); + + return make_naive_tensor_descriptor_packed(make_tuple(K0, 
K1)); + } + + __host__ __device__ static constexpr auto MakeCK1NH2W2ThreadDescriptor() + { + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, I1, Number{}, Number{})); + return c_k1_n_h2_w2_thread_gemm_desc; + } + + // using CThreadDesc_K1_N_H2_W2 = decltype(MakeCK1NH2W2ThreadDescriptor()); + + __host__ __device__ static constexpr auto GetBlockWiseGemm() + { + constexpr auto max_lds_align = Number{}; + + constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, Number{}), max_lds_align); + + constexpr auto b_e1_n_h_w_e2_block_gemm_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + I1, + Number{}, + Number{}, + Number{})); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + + return blockwise_gemm; + } + + __device__ static constexpr auto GetCThreadIndex() + { + auto blockwise_gemm = GetBlockWiseGemm(); + auto c_thread_mtx_index = + blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id()); + + return c_thread_mtx_index; + }; + + __device__ static constexpr auto GetCBlockIndex( + const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor) + { + const auto c_k_n_h_w_block_cluster_idx = + cblockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex( + make_multi_index(get_block_1d_id())); + return c_k_n_h_w_block_cluster_idx; + } + + template + __device__ static void BiasOp(BiasGlobalBuff& bias_global_buf, + CThreadBuff& c_thread_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const BiasGridDesc_K0_K1& bias_k0_k1_grid_desc, + const CThreadDesc_K1_N_H2_W2&) + + { + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + + const auto k_thread_id = c_thread_idx[I0]; + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + constexpr auto bias_k0_k1_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{})); + + StaticBuffer + bias_thread_buf; + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + auto bias_threadwise_transfer = + ThreadwiseTensorSliceTransfer_v2{}>, + Sequence<0, 1>, + 1, + CThreadTransferDstScalarPerVector, + false, + true>( + bias_k0_k1_grid_desc, make_multi_index(k_block_work_id, k_thread_data_on_global)); + + constexpr auto bias_k0_k1_global_tensor_step_hacks = make_tuple( + make_tuple(Sequence<0>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<0>{})); + + bias_threadwise_transfer.Run(bias_k0_k1_grid_desc, + bias_global_buf, + bias_k0_k1_thread_desc, + make_tuple(I0, I0), + bias_thread_buf, + bias_k0_k1_global_tensor_step_hacks); + + static_for<0, KPerThread, 1>{}([&](auto ki) { + static_for<0, HoPerThread, 1>{}([&](auto hi) { + static_for<0, WoPerThread, 1>{}([&](auto wi) { + constexpr index_t c_offset = + c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset(make_tuple(ki, 0, hi, wi)); + c_thread_buf(Number{}) = + c_thread_buf[Number{}] + bias_thread_buf[ki]; + }); + }); + }); + } + + template + __device__ static void Activation(CThreadBuff& c_thread_buf, + const CThreadDesc_K1_N_H2_W2&, + integral_constant) + { + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + static_for<0, c_k1_n_h2_w2_thread_gemm_desc.GetElementSpaceSize(), 1>{}([&](auto i) { + if constexpr(activ_type_ == 1) + { + c_thread_buf(i) = c_thread_buf[i] >= 0 ? 
c_thread_buf[i] : alpha * c_thread_buf[i]; + } + else if constexpr(activ_type_ == 2) + { + FloatAcc x = 1.0 + exp(-c_thread_buf[i]); + + asm volatile("\n \ + v_rcp_f32 %0, %1 \n" + : "=v"(x) + : "0"(x)); + + c_thread_buf(i) = x; + } + }); + } + + template + __device__ static void + WriteOut(const CThreadBuff& c_thread_buf, + CGlobalBuff& c_global_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc) + { + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + const auto k_thread_id = c_thread_idx[I0]; + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + // hack to control index calculation when iterating over c_k_n_h0_h1_h2_w0_w1_w2_global + // tensor + constexpr auto c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks = CGlobalStepHacks{}; + + constexpr auto c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{})); + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + ThreadwiseTensorSliceTransfer_v1r3< + FloatAcc, + FloatC, + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc), + decltype(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + make_multi_index(k_block_work_id, + k_thread_data_on_global, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0)) + .Run(c_k0_k1_n_h0_h1_h2_w0_w1_w2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + c_global_buf, + c_k_n_h0_h1_h2_w0_w1_w2_global_tensor_step_hacks); + } + + template + __device__ static void + MaxPool(const CThreadBuff& c_thread_buf, + DGlobalBuff& d_global_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const CThreadDesc_K1_N_H2_W2&, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc) + { + + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + const auto k_thread_id = c_thread_idx[I0]; + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + static_assert(HoPerThread % 2 == 0 && WoPerThread % 2 == 0, ""); + + constexpr auto HoPerThread_2 = HoPerThread / 2; + constexpr auto WoPerThread_2 = WoPerThread / 2; + + constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{})); + + StaticBuffer + d_thread_buf; + + static_for<0, KPerThread, 1>{}([&](auto ki) { + static_for<0, HoPerThread_2, 1>{}([&](auto hi) { + 
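+                // 2x2 max-pool over the register-resident conv output: each (hi, wi) element
+                // of d_thread_buf takes the max of the corresponding 2x2 window of c_thread_buf.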
static_for<0, WoPerThread_2, 1>{}([&](auto wi) { + constexpr index_t d_offset = + d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc.CalculateOffset( + make_tuple(0, ki, 0, 0, 0, hi, 0, 0, wi)); + + constexpr index_t c_offset_0 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2, wi * 2)); + constexpr index_t c_offset_1 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2, wi * 2 + 1)); + constexpr index_t c_offset_2 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2 + 1, wi * 2)); + constexpr index_t c_offset_3 = c_k1_n_h2_w2_thread_gemm_desc.CalculateOffset( + make_tuple(ki, 0, hi * 2 + 1, wi * 2 + 1)); + + d_thread_buf(Number{}) = c_thread_buf[Number{}]; + d_thread_buf(Number{}) = + fmaxf(c_thread_buf[Number{}], d_thread_buf(Number{})); + d_thread_buf(Number{}) = + fmaxf(c_thread_buf[Number{}], d_thread_buf(Number{})); + d_thread_buf(Number{}) = + fmax(c_thread_buf[Number{}], d_thread_buf(Number{})); + }); + }); + }); + + const index_t k_thread_data_on_global = k_thread_id * KPerThread; + + constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{}; + + ThreadwiseTensorSliceTransfer_v1r3< + FloatC, + FloatC, + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc), + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + InMemoryDataOperationEnum::Set, + 1, + true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + make_multi_index(k_block_work_id, + k_thread_data_on_global, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0)) + .Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0), + d_thread_buf, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + d_global_buf, + d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks); + } + + template + __device__ static void + ResizeAdd(const CThreadBuff& c_thread_buf, + DGlobalBuff& d_global_buf, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const CThreadDesc_K1_N_H2_W2&, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc) + { + + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + const auto k_thread_id = c_thread_idx[I0]; + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + constexpr auto HoPerThreadx2 = HoPerThread * 2; + constexpr auto WoPerThreadx2 = WoPerThread * 2; + + constexpr auto d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{})); + + StaticBuffer + d_thread_buf; + + static_for<0, KPerThread, 1>{}([&](auto k_i) { + static_for<0, HoPerThreadx2, 1>{}([&](auto h_i) { + static_for<0, WoPerThreadx2, 1>{}([&](auto w_i) { + d_thread_buf(Number{}) = + c_thread_buf[Number{}]; + }); + }); + }); + + // hack to control index calculation when iterating over d_k_n_ho_wo_global tensor + constexpr auto d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks = DGlobalStepHacks{}; + + const index_t k_thread_data_on_global = k_thread_id * 
KPerThread; + + ThreadwiseTensorSliceTransfer_v1r3< + FloatC, + FloatC, + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc), + decltype(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc), + Sequence, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + InMemoryDataOperationEnum::Add, + 1, + true>(d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + make_multi_index(k_block_work_id, + k_thread_data_on_global, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0)) + .Run(d_k0_k1_n_h0_h1_hx_w0_w1_wx_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0), + d_thread_buf, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + d_global_buf, + d_k_n_h0_h1_hx_w0_w1_wx_global_tensor_step_hacks); + } + + template + __device__ static void + GemmOp(const AGlobalBuff& a_global_buf, + const BGlobalBuff& b_global_buf, + CThreadBuff& c_thread_buf, + FloatAB* __restrict__ p_shared_block, + const CBlockIndex& c_block_idx, + const CThreadIndex& c_thread_idx, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CThreadDesc_K1_N_H2_W2&, + integral_constant) + { + constexpr auto HasMainE1BlockLoop = CalculateHasMainE1BlockLoop(); + constexpr auto HasDoubleTailE1BlockLoop = CalculateHasDoubleTailE1BlockLoop(); + + // const auto c_k_n_h_w_block_cluster_idx = + // GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor); + // cblockid_to_k_n_h_w_block_cluster_adaptor.CalculateBottomIndex( + // make_multi_index(get_block_1d_id())); + + const index_t k_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I0]); + const index_t n_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I1]); + const index_t ho_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I2]); + const index_t wo_block_work_id = __builtin_amdgcn_readfirstlane(c_block_idx[I3]); + + constexpr auto max_lds_align = Number{}; + + constexpr auto a_e1_k1_e2_block_gemm_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, Number{}), max_lds_align); + + constexpr auto b_e1_n_h_w_e2_block_gemm_desc = + make_naive_tensor_descriptor_packed(make_tuple(Number{}, + I1, + Number{}, + Number{}, + Number{})); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = CThreadDesc_K1_N_H2_W2{}; + + auto blockwise_gemm = + BlockwiseGemmDlops_km_kn_m0m1n0n1_v3{}; + // blockwise_gemm.GetBeginOfCThreadDesc_K_N_Ho_Wo(get_thread_local_1d_id()); + + const auto ho_thread_id = c_thread_idx[I2]; + const auto wo_thread_id = c_thread_idx[I3]; + + constexpr auto a_e0_e1_k0_k1_e2_block_copy_desc = make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, I1, Number{}, Number{}), + max_lds_align); + + // A matrix blockwise copy + auto a_blockwise_copy = + BlockwiseTensorSliceTransfer_v4, + ABlockTransferThreadSliceLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterLengths_E0_E1_K0_K1_E2, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_e0_e1_k0_k1_e2_grid_desc), + decltype(a_e0_e1_k0_k1_e2_block_copy_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 1, 2, 3, 4>, + ABlockTransferSrcVectorDim, + 4, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_E2, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + false>(a_e0_e1_k0_k1_e2_grid_desc, + make_multi_index(0, 0, k_block_work_id, 0, 0), + a_e0_e1_k0_k1_e2_block_copy_desc, + make_multi_index(0, 0, 0, 0, 0)); + + constexpr auto a_block_slice_copy_step = 
make_multi_index(I1, 0, 0, 0, 0); + + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc = + make_naive_tensor_descriptor_packed(make_tuple(I1, + Number{}, + I1, + I1, + I1, + Number{}, + I1, + I1, + Number{}, + Number{})); + + auto b_threadwise_transfer = ThreadwiseTensorSliceTransfer_v2< + FloatAB, + FloatAB, + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc), + decltype(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc), + Sequence, + BBlockTransferSrcAccessOrder, + BBlockTransferSrcVectorDim, + BBlockTransferSrcScalarPerVector, + BThreadTransferSrcResetCoordinateAfterRun, + true>(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + make_multi_index(0, + 0, + n_block_work_id, + ho_block_work_id, + ho_thread_id, + 0, + wo_block_work_id, + wo_thread_id, + 0, + 0)); + + auto a_block_buf = make_dynamic_buffer( + p_shared_block, a_e0_e1_k0_k1_e2_block_copy_desc.GetElementSpaceSize()); + + //// register allocation for output + // StaticBuffer + // c_thread_buf; + + // initialize output thread tensor + ThreadwiseTensorSliceSet_v1>{} + .Run(c_k1_n_h2_w2_thread_gemm_desc, + make_tuple(I0, I0, I0, I0), + c_thread_buf, + FloatAcc{0}); + + constexpr auto b_thread_slice_copy_step = + make_multi_index(0, E1PerBlock, 0, 0, 0, 0, 0, 0, 0, 0); + + // hack to control index calculation when iterating over A and B matrix for threadwise copy + constexpr auto a_e0_e1_k_e2_global_step_hacks = AGlobalStepHacks{}; + constexpr auto b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks = BGlobalStepHacks{}; + + // double regsiter buffer for b + StaticBuffer + b_thread_even_buf, b_thread_odd_buf; + + if constexpr(HasMainE0BlockLoop) + { + const auto E0 = b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetLength(I0); + + index_t e0_block_data_begin = 0; + + do + { + // LDS double buffer: preload data + { + a_blockwise_copy.RunRead( + a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf); + } + + __syncthreads(); + + if constexpr(HasMainE1BlockLoop) + { + index_t e1_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on 
current data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + e1_block_data_begin += 2 * E1PerBlock; + + } while(e1_block_data_begin < E1 - 2 * E1PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left + { + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + } + + a_blockwise_copy.MoveSrcSliceWindow(a_e0_e1_k0_k1_e2_grid_desc, + a_block_slice_copy_step, + AGlobalMoveSliceWindowStepHacks{}); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(-(E1 - E1PerBlock), 0, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + e0_block_data_begin += 1; + + } while(e0_block_data_begin < E0); + } + else + { + // LDS double buffer: preload data + { + a_blockwise_copy.RunRead( + a_e0_e1_k0_k1_e2_grid_desc, a_global_buf, a_e0_e1_k_e2_global_step_hacks); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + a_blockwise_copy.RunWrite(a_e0_e1_k0_k1_e2_block_copy_desc, a_block_buf); + } + + __syncthreads(); + + if constexpr(HasMainE1BlockLoop) + { + index_t e1_block_data_begin = 0; + + // LDS double buffer: main body + // use Do-While loop instead of For loop to simplify control flow + do + { + // even iteration + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + b_threadwise_transfer.MoveSrcSliceWindow( + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_even_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on current data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + 
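+                    // Odd-buffer GEMM issued above; advance the A slice window in LDS so the
+                    // next pair of double-buffered iterations consumes the following E1PerBlock slab.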
+ blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + e1_block_data_begin += 2 * E1PerBlock; + + } while(e1_block_data_begin < E1 - 2 * E1PerBlock); + } + + // LDS double buffer: tail + if constexpr(HasDoubleTailE1BlockLoop) // if has 2 iteration left + { + b_threadwise_transfer.MoveSrcSliceWindow(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_thread_slice_copy_step, + BGlobalMoveSliceWindowStepHacks{}); + + b_threadwise_transfer.Run(b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + b_global_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_thread_copy_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_odd_buf, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_global_step_hacks); + + // LDS double buffer: GEMM on 2nd-last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + + blockwise_gemm.MoveABlockSliceWindow(make_tuple(E1PerBlock, 0, 0)); + + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_odd_buf, c_thread_buf); + } + else // if has 1 iteration left + { + // LDS double buffer: GEMM on last data + blockwise_gemm.Run(a_block_buf, b_thread_even_buf, c_thread_buf); + } + } + } + + template + __device__ static void + Conv(const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_c_global, + FloatC* __restrict__ p_d_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant) + { + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); + auto d_global_buf = make_dynamic_buffer( + p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Output + WriteOut(c_thread_buf, + c_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + } + + template + __device__ static void ConvBiasActiv( + const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_c_global, + FloatAB* __restrict__ 
p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant, + integral_constant) + { + static constexpr auto activ_type = integral_constant{}; + + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Bias + BiasOp(bias_global_buf, + c_thread_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + bias_k0_k1_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc); + + // Activ + Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type); + + // Output + WriteOut(c_thread_buf, + c_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + } + + template + __device__ static void ConvBiasActivMaxpool( + const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_c_global, + FloatC* __restrict__ p_d_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant, + integral_constant) + { + static constexpr auto activ_type = integral_constant{}; + + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto c_global_buf = make_dynamic_buffer( + p_c_global, c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc.GetElementSpaceSize()); + auto d_global_buf = make_dynamic_buffer( + p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto 
c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Bias + BiasOp(bias_global_buf, + c_thread_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + bias_k0_k1_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc); + + // Activ + Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type); + + // Output + WriteOut(c_thread_buf, + c_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + // MaxPool + MaxPool(c_thread_buf, + d_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k1_n_h2_w2_thread_gemm_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc); + } + + template + __device__ static void ConvBiasActivResizeAdd( + const FloatAB* __restrict__ p_a_global, + const FloatAB* __restrict__ p_b_global, + const FloatC* __restrict__ p_bias_global, + FloatC* __restrict__ p_d_global, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_E0_E1_K0_K1_E2& a_e0_e1_k0_k1_e2_grid_desc, + const BGridDesc_E0_E1_N_H0_H1_H2_W0_W1_W2_E2& b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + const CGridDesc_K0_K1_N_H0_H1_H2_W0_W1_W2& c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc, + const DGridDesc_K0_K1_N_H0_H1_Hx_W0_W1_Wx& d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc, + const CBlockIdToBlockClusterAdaptor_K_N_H_W& cblockid_to_k_n_h_w_block_cluster_adaptor, + integral_constant, + integral_constant) + { + static constexpr auto activ_type = integral_constant{}; + + const auto bias_k0_k1_grid_desc = + MakeBiasK0K1GridDescriptor(c_k0_k1_n_h0_h1_h2_w0_w1_w2_grid_desc); + + const auto a_global_buf = make_dynamic_buffer( + p_a_global, a_e0_e1_k0_k1_e2_grid_desc.GetElementSpaceSize()); + const auto b_global_buf = make_dynamic_buffer( + p_b_global, b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc.GetElementSpaceSize()); + auto d_global_buf = make_dynamic_buffer( + p_d_global, d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc.GetElementSpaceSize()); + auto bias_global_buf = make_dynamic_buffer( + p_bias_global, bias_k0_k1_grid_desc.GetElementSpaceSize()); + + constexpr auto c_k1_n_h2_w2_thread_gemm_desc = MakeCK1NH2W2ThreadDescriptor(); + + // register allocation for output + StaticBuffer + c_thread_buf; + + const auto c_k_n_h_w_block_cluster_idx = + GetCBlockIndex(cblockid_to_k_n_h_w_block_cluster_adaptor); + + const auto c_thread_mtx_index = GetCThreadIndex(); + + // GemmOp + GemmOp(a_global_buf, + b_global_buf, + c_thread_buf, + p_shared_block, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + a_e0_e1_k0_k1_e2_grid_desc, + b_e0_e1_n_h0_h1_h2_w0_w1_w2_e2_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc, + integral_constant{}); + + // Bias + BiasOp(bias_global_buf, + c_thread_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + bias_k0_k1_grid_desc, + c_k1_n_h2_w2_thread_gemm_desc); + + // Activ + Activation(c_thread_buf, c_k1_n_h2_w2_thread_gemm_desc, activ_type); + + // Resize_Add + ResizeAdd(c_thread_buf, + d_global_buf, + c_k_n_h_w_block_cluster_idx, + c_thread_mtx_index, + c_k1_n_h2_w2_thread_gemm_desc, + d_k0_k1_n_h0_h1_hx_w0_w1_wx_grid_desc); + } +}; + +} // namespace 
ck +#endif diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp new file mode 100644 index 00000000..578665ea --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_multiple_r_xdl_cshuffle.hpp @@ -0,0 +1,944 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +template +struct GridwiseGemmMultipleDMultipleR_k0mk1_k0nk1_mn_xdl_cshuffle_v1 +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + static constexpr index_t NumRTensor = RsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + static constexpr auto AK0PerBlock = Number{}; + static constexpr auto BK0PerBlock = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0PerBlock, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0PerBlock, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + // ck::Tuple + template + static constexpr auto MakeTsGridPointer() + { + return generate_tuple( + [&](auto i) { + using T = remove_cvref_t>; + if constexpr(isConst) + return static_cast(nullptr); + else + return static_cast(nullptr); + }, + Number{}); + } + + __host__ __device__ static constexpr index_t 
GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatCShuffle)); + } + + // A desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / AK1; + + return transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // B desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / BK1; + + return transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc_M_K& a_grid_desc_m_k, + const BGridDesc_N_K& b_grid_desc_n_k, + const EGridDesc_M_N& e_grid_desc_m_n, + const RGridDesc_M& r_grid_desc_m, + const Block2ETileMap& block_2_etile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + static_assert(AGridDesc_M_K::GetNumOfDimension() == 2); + static_assert(BGridDesc_N_K::GetNumOfDimension() == 2); + static_assert(EGridDesc_M_N::GetNumOfDimension() == 2); + + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + return false; + + if(M != r_grid_desc_m.GetLength(I0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + 
} + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N& e_grid_desc_m_n) + { + const auto M = e_grid_desc_m_n.GetLength(I0); + const auto N = e_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + e_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return e_grid_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr auto + MakeRGridDescriptor_MBlock_MPerBlock(const RGridDesc_M& r_grid_desc_m) + { + const auto M = r_grid_desc_m.GetLength(I0); + const auto MBlock = M / MPerBlock; + + const auto r_grid_desc_mblock_mperblock = transform_tensor_descriptor( + r_grid_desc_m, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{}))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + return r_grid_desc_mblock_mperblock; + } + + // return block_id to E matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + e_grid_desc_m_n); + } + + using DefaultAGridDesc_AK0_M_AK1 = + remove_cvref_t; + using DefaultBGridDesc_BK0_N_BK1 = + remove_cvref_t; + using EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + // Support 2 dimension in the future. 
Not only M + using RGridDescriptor_MBlock_MPerBlock = + remove_cvref_t; + + using DefaultBlock2ETileMap = + remove_cvref_t; + + using DsGridPointer = decltype(MakeTsGridPointer()); + using RsGridPointer = decltype(MakeTsGridPointer()); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + DsGridPointer p_ds_grid, + FloatE* __restrict__ p_e_grid, + RsGridPointer p_rs_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const QsElementwiseOperation& qs_element_op, + const RsElementwiseOperation& rs_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const StaticallyIndexedArray& + ds_grid_desc_mblock_mperblock_nblock_nperblock, // FIXME: Ds desc may be of different + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + e_grid_desc_mblock_mperblock_nblock_nperblock, + const StaticallyIndexedArray& + rs_grid_desc_mblock_mperblock, // FIXME: Rs desc may be of different + const Block2ETileMap& block_2_etile_map) + { + // FIXME - Share code with other gemm kernel + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + + const auto ds_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_ds_grid[i], + ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize()); + }, + Number{}); + + auto e_grid_buf = make_dynamic_buffer( + p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + auto rs_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_rs_grid(i), rs_grid_desc_mblock_mperblock[i].GetElementSpaceSize()); + }, + Number{}); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_etile_map.ValidCTileIndex( + block_work_idx, + make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, 
m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C + Ds + reduction + write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
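+            // The per-thread xdlops accumulator is viewed below as an 8-d tensor: the M extent is
+            // decomposed into M0 (MXdlPerWave) x M1 (MWave) x M2*M3*M4 (one MPerXdl tile), and the
+            // N extent into N0 (NXdlPerWave) x N1 (NWave) x N2 (NPerXdl). The descriptors that
+            // follow expose this decomposition so that one CShuffle slice can be staged through
+            // LDS per shuffle step.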
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + 
SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_der_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + // TODO: this should be implemented as a blockwise reduction + // LDS c_reduce_block_desc_mperblock_nperblock + constexpr auto c_reduce_block_desc_mperblock_nperblock = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I1)), + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I3))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<>{}, Sequence<1>{})); + + static_assert(CDRThreadTransferClusterLengths_MPerBlock_NPerBlock::At(I0) * + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock::At(I1) == + BlockSize, + "wrong!"); + + static_assert((CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) % + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock::At(I0) == + 0 && + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) % + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock::At(I1) == + 0, + "wrong!"); + + constexpr index_t mreduce_per_thread = + (CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) / + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock::At(I0); + + constexpr index_t nreduce_per_thread = + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) / + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock::At(I1); + + constexpr auto c_reduce_thread_lengths_mperblock_nperblock = + Sequence{}; + + // VGPR cde_reduce_thread_desc_mperblock_nperblock + constexpr auto cde_reduce_thread_desc_mperblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + constexpr auto r_thread_desc_mperblock = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + constexpr auto r_thread_desc_mblock_mperblock = + make_naive_tensor_descriptor_packed(make_tuple(I1, Number{})); + + auto e_thread_buf = make_static_buffer( + cde_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize()); + + // reduce: threadwise copy from LDS to VGPR + constexpr auto c_reduce_thread_cluster_desc = make_cluster_descriptor( + CDRThreadTransferClusterLengths_MPerBlock_NPerBlock{}, Sequence<1, 0>{}); + + const auto c_reduce_thread_cluster_idx = + c_reduce_thread_cluster_desc.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto c_reduce_thread_data_idx_begin = + c_reduce_thread_cluster_idx * c_reduce_thread_lengths_mperblock_nperblock; + + // To apply D0, D1, ... and reduction. 
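+            // Summary of the epilogue implemented below: for every space-filling-curve access step
+            // the block (1) copies one shuffled C tile from the accumulator VGPRs into LDS,
+            // (2) reads that tile back into per-thread registers, (3) loads the matching
+            // D0, D1, ... tiles from global memory, (4) applies cde_element_op to form the E tile
+            // and writes it out, and (5) applies qs_element_op, reduces along the N dimension with
+            // the threadwise reduction, and writes the result to R0, R1, ... The same LDS tile is
+            // reused on every step, so each reuse is guarded by block_sync_lds().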
+ // Copy c shuffle from LDS back to VGPR + auto c_reduce_thread_copy_lds_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatCShuffle, + FloatReduceAcc, + decltype(c_reduce_block_desc_mperblock_nperblock), + decltype(cde_reduce_thread_desc_mperblock_nperblock), + decltype(c_reduce_thread_lengths_mperblock_nperblock), + Sequence<0, 1>, + 1, + CDEReduceThreadTransferScalarPerVector_NPerBlock, + 1, + true>{c_reduce_block_desc_mperblock_nperblock, c_reduce_thread_data_idx_begin}; + + // Copy result of reduction back from VGPR to global + auto reduce_tuple_thread_copy_vgpr_to_global = generate_tuple( + [&](auto I) { + auto p_r_grid = p_rs_grid[I]; + auto r_element_op = rs_element_op[I]; + auto r_grid_desc_mblock_mperblock = rs_grid_desc_mblock_mperblock[I]; + + return ThreadwiseTensorSliceTransfer_v1r3< + FloatReduceAcc, + remove_pointer_t, + decltype(r_thread_desc_mblock_mperblock), + decltype(r_grid_desc_mblock_mperblock), + decltype(r_element_op), + Sequence<1, mreduce_per_thread>, + Sequence<0, 1>, + 1, + RThreadTransferDstScalarPerVector_MPerBlock, + RsGlobalMemoryDataOperation::At(I), + 1, + false>{r_grid_desc_mblock_mperblock, + make_multi_index(block_work_idx[I0], // mblock + c_reduce_thread_data_idx_begin[I0]), // mperblock + r_element_op}; + }, + Number{}); + + // D0, D1, ..., Dn + constexpr auto cde_reduce_thread_desc_I1_mperblock_I1_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, I1, Number{})); + + // FIXME: Decrease usage of VGPR + // Apply pointwise lambda function from multi-source (Global and LDS) into VGPR + auto ds_thread_buf = generate_tuple( + [&](auto) { + return make_static_buffer( + cde_reduce_thread_desc_I1_mperblock_I1_nperblock.GetElementSpaceSize()); + }, + Number{}); + + // Copy D0, D1, ..., Dn from global to VGPR + auto ds_thread_copy_global_to_vgpr = generate_tuple( + [&](auto I) { + using DDataType = remove_cvref_t>; + return ThreadwiseTensorSliceTransfer_v2< + DDataType, + FloatReduceAcc, + decltype(ds_grid_desc_mblock_mperblock_nblock_nperblock[I]), + decltype(cde_reduce_thread_desc_I1_mperblock_I1_nperblock), + Sequence, + Sequence<0, 1, 2, 3>, + 3, + CDEReduceThreadTransferScalarPerVector_NPerBlock, + 1, + true>(ds_grid_desc_mblock_mperblock_nblock_nperblock[I], + make_multi_index( + I0, + m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0], + I0, + n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1])); + }, + Number{}); + + auto e_thread_copy_vgpr_to_global = ThreadwiseTensorSliceTransfer_v1r3< + FloatReduceAcc, + FloatE, + decltype(cde_reduce_thread_desc_I1_mperblock_I1_nperblock), + decltype(e_grid_desc_mblock_mperblock_nblock_nperblock), + tensor_operation::element_wise::PassThrough, + Sequence, // SliceLengths + Sequence<0, 1, 2, 3>, // DimAccessOrder + 3, // DstVectorDim + CDEReduceThreadTransferScalarPerVector_NPerBlock, + InMemoryDataOperationEnum::Set, + 1, + true>{ + e_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(I0, + m_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I0], + I0, + n_block_data_idx_on_grid + c_reduce_thread_data_idx_begin[I1]), + tensor_operation::element_wise::PassThrough{}}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_der_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to read from LDS + block_sync_lds(); + + // each thread shuffle data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + 
sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to write to LDS + block_sync_lds(); + + // Get shuffle data from LDS to VGPR + c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock, + c_shuffle_block_buf, + cde_reduce_thread_desc_mperblock_nperblock, + make_tuple(I0, I0), + e_thread_buf); + + // Global read D0, D1, ... + static_for<0, NumDTensor, 1>{}([&](auto Id) { + auto& d_thread_copy_global_to_vgpr = ds_thread_copy_global_to_vgpr(Id); + d_thread_copy_global_to_vgpr.Run( + ds_grid_desc_mblock_mperblock_nblock_nperblock[Id], + ds_grid_buf[Id], + cde_reduce_thread_desc_I1_mperblock_I1_nperblock, + make_tuple(I0, I0, I0, I0), + ds_thread_buf(Id)); + + if constexpr(access_id < num_access - 1) + { + // move on D0, D1, ... + constexpr auto de_global_step = sfc_der_global.GetForwardStep(access_id); + d_thread_copy_global_to_vgpr.MoveSrcSliceWindow( + ds_grid_desc_mblock_mperblock_nblock_nperblock[Id], de_global_step); + } + }); + + // cde_element_op(e, c, d0, d1, ...); + static_for<0, cde_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + const auto c_ds_src_data_refs = concat_tuple_of_reference( + tie(e_thread_buf[i]), + generate_tie( + [&](auto Id) -> const auto& { return ds_thread_buf[Id][i]; }, + Number{})); + auto e_dst_data_refs = tie(e_thread_buf(i)); + unpack2(cde_element_op, e_dst_data_refs, c_ds_src_data_refs); + }); + + // Global write E + e_thread_copy_vgpr_to_global.Run(cde_reduce_thread_desc_I1_mperblock_I1_nperblock, + make_tuple(I0, I0, I0, I0), + e_thread_buf, + e_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_buf); + + if constexpr(access_id < num_access - 1) + { + // move on E + constexpr auto de_global_step = sfc_der_global.GetForwardStep(access_id); + e_thread_copy_vgpr_to_global.MoveDstSliceWindow( + e_grid_desc_mblock_mperblock_nblock_nperblock, de_global_step); + } + + // reduction + static_for<0, NumRTensor, 1>{}([&](auto Ir) { + auto r_thread_buf = make_static_buffer( + r_thread_desc_mperblock.GetElementSpaceSize()); + + auto& reduce_thread_copy_vgpr_to_global = + reduce_tuple_thread_copy_vgpr_to_global(Ir); + + using ThreadReduceOperation = + remove_cvref_t; + + using ThreadwiseReduce = + ThreadwiseReduction; + + // threadwise reduction + const auto reduce_identityVal = + ThreadReduceOperation::template GetIdentityValue(); + static_for<0, mreduce_per_thread, 1>{}( + [&](auto I) { r_thread_buf(I) = reduce_identityVal; }); + static_for<0, mreduce_per_thread, 1>{}([&](auto im) { + static_for<0, nreduce_per_thread, 1>{}([&](auto in) { + constexpr auto offset = + Number{}; + + qs_element_op[Ir](e_thread_buf(offset), e_thread_buf(offset)); + }); + }); + ThreadwiseReduce::Reduce(e_thread_buf, r_thread_buf); + + // gridwise reduction + reduce_thread_copy_vgpr_to_global.Run(r_thread_desc_mblock_mperblock, + make_tuple(I0, I0), + r_thread_buf, + rs_grid_desc_mblock_mperblock[Ir], + rs_grid_buf(Ir)); + + if constexpr(access_id < num_access - 1) + { + // move on R0, R1, ... 
+ constexpr auto de_global_step = sfc_der_global.GetForwardStep(access_id); + reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow( + rs_grid_desc_mblock_mperblock[Ir], + make_tuple(de_global_step[I0], de_global_step[I1])); + } + }); + }); // copy c, d, e + reduction + + } // shuffle C + Ds + reduction + write out + } // Run +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..da0b0cea --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,753 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// GEMM: +// input : A[M, K] +// input : B[N, K] +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// D0, D1, ... 
and E have the same layout +template +struct GridwiseGemmMultipleD_xdl_cshuffle +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + static constexpr auto AK0PerBlock = Number{}; + static constexpr auto BK0PerBlock = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0PerBlock, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0PerBlock, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + // ck::Tuple + static constexpr auto MakeDsGridPointer() + { + return generate_tuple( + [&](auto i) { + using DDataType = remove_cvref_t>; + + return static_cast(nullptr); + }, + Number{}); + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(ABDataType), + c_block_size * sizeof(CShuffleDataType)); + } + + // A desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeDefaultAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / AK1; + + return transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, 
AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // B desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeDefaultBGridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / BK1; + + return transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // E desc for destination in blockwise copy + template + __host__ __device__ static constexpr auto + MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N& e_grid_desc_m_n) + { + const auto M = e_grid_desc_m_n.GetLength(I0); + const auto N = e_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + e_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return e_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // Ds desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N& ds_grid_desc_m_n) + { + return generate_tuple( + [&](auto i) { + return MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]); + }, + Number{}); + } + + // return block_id to E matrix tile idx (m0, n0) mapping + template + __host__ __device__ static constexpr auto + MakeDefaultBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + e_grid_desc_m_n); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool CheckValidity(const AGridDesc_M_K& a_grid_desc_m_k, + const BGridDesc_N_K& b_grid_desc_n_k, + const DsGridDesc_M_N& ds_grid_desc_m_n, + const EGridDesc_M_N& e_grid_desc_m_n, + const Block2ETileMap& block_2_etile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + // check consistency of desc + if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1))) + { + return false; + } + + bool valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + valid = valid && (M == ds_grid_desc_m_n[i].GetLength(I0) && + N == ds_grid_desc_m_n[i].GetLength(I1)); + }); + + if(!valid) + { + return false; + } + + // check tile size + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + { + return false; + } + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + // check block-to-E-tile + if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + // check tensor size: cannot 
be larger than 2GB each + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + if(!(a_grid_desc_m_k.GetElementSpaceSize() * sizeof(ABDataType) <= TwoGB && + b_grid_desc_n_k.GetElementSpaceSize() * sizeof(ABDataType) <= TwoGB && + e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB)) + { + return false; + } + + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + using DsGridPointer = decltype(MakeDsGridPointer()); + + template + __device__ static void Run(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsGridPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap& block_2_etile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + + const auto ds_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_ds_grid[i], + ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize()); + }, + Number{}); + + auto e_grid_buf = make_dynamic_buffer( + p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_etile_map.ValidCTileIndex( + block_work_idx, + make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABDataType, + ABDataType, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), 
+ ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + ABDataType, + ABDataType, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = + math::max(math::lcm(AK1, BK1), + MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + ABDataType, + AccDataType, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // tuple of reference to C/Ds tensor descriptors + const auto c_ds_desc_refs = 
concat_tuple_of_reference( + tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; }, + Number{})); + + // tuple of reference to C/Ds tensor descriptors + const auto c_ds_buf_refs = concat_tuple_of_reference( + tie(c_shuffle_block_buf), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return ds_grid_buf[i]; }, + Number{})); + + // tuple of starting index of C/Ds blockwise copy + const auto idx_c_ds_block_begin = container_concat( + make_tuple(make_multi_index(0, 0, 0, 0)), + generate_tuple( + [&](auto) { + return make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0); + }, + Number{})); + + // blockwise copy C/D/E between LDS and global + auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7< + ThisThreadBlock, + decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), + Tuple, + decltype(c_ds_desc_refs), + decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)), + CDEElementwiseOperation, + Sequence(EGlobalMemoryDataOperation)>, // FIXME: make Sequence + // support arbitray type + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, + sequence_merge_t< + Sequence, + uniform_sequence_gen_t>, // ThreadTransferSrcResetCoordinateAfterRunFlags + Sequence> // ThreadTransferDstResetCoordinateAfterRunFlags + {c_ds_desc_refs, + idx_c_ds_block_begin, + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + make_tuple(make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0)), + cde_element_op}; + + // space filling curve for threadwise C in VGPR before shuffle + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C/D/E + constexpr auto sfc_cde_block = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + cde_block_copy_lds_and_global.Run( + c_ds_desc_refs, + c_ds_buf_refs, + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + tie(e_grid_buf)); + + if constexpr(access_id < num_access - 1) + { + constexpr auto cde_lds_and_global_step = + sfc_cde_block.GetForwardStep(access_id); + + // move on Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + cde_block_copy_lds_and_global.MoveSrcSliceWindow( + c_ds_desc_refs, i + I1, cde_lds_and_global_step); + }); + + // move on E + 
cde_block_copy_lds_and_global.MoveDstSliceWindow( + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + I0, + cde_lds_and_global_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp new file mode 100644 index 00000000..98331d85 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp" + +namespace ck { + +enum struct PipelineVersion +{ + v1, + v2, +}; + +template +constexpr auto GridwiseGemmPipeline_Selector() +{ + if constexpr(PipelineVer == PipelineVersion::v1) + { + if constexpr(LoopSched == LoopScheduler::Default) + { + return GridwiseGemmPipeline_v1{}; + } + else if constexpr(LoopSched == LoopScheduler::Interwave) + { + return GridwiseGemmPipelineInterwave_v1{}; + } + } + else if constexpr(PipelineVer == PipelineVersion::v2) + { + return GridwiseGemmPipeline_v2{}; + } + else + { + std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl; + } +} + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp new file mode 100644 index 00000000..e9097552 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
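+
+// This header implements the PipelineVersion::v1 GEMM main loops returned by
+// GridwiseGemmPipeline_Selector (see gridwise_gemm_pipeline_selector.hpp): a 1-stage and a
+// 2-stage global-memory prefetch schedule, plus inter-wave-scheduled variants. A minimal
+// sketch of how a gridwise GEMM is expected to pick up the pipeline type at compile time;
+// the alias and parameter names are illustrative only, following the pattern used by the
+// kernels earlier in this patch:
+//
+//   using GridwiseGemmPipe = remove_cvref_t<decltype(
+//       GridwiseGemmPipeline_Selector<PipelineVer, NumGemmKPrefetchStage, LoopSched>())>;
+//
+//   // later, inside the kernel:
+//   //   GridwiseGemmPipe::IsSupported(num_k_loop);
+//   //   GridwiseGemmPipe::CalculateHasMainLoop(num_k_loop);
+//   //   gridwise_gemm_pipeline.template Run<HasMainKBlockLoop>(...);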
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" + +namespace ck { + +template +struct GridwiseGemmPipeline_v1; + +// 1-stage prefetch +template <> +struct GridwiseGemmPipeline_v1<1> +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // preload data into LDS + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + +// 2-stage prefetch +template <> +struct GridwiseGemmPipeline_v1<2> +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + __host__ __device__ static constexpr bool IsSupported(index_t num_loop) + { + // TODO: improve applicability + return num_loop % 2 == 0; + } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return (num_loop / 2) > 1; + } + + template + static __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // preload data into LDS + { + // Read 0 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0); + + // Move + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + 
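+            // Two prefetch register buffers (slots I0 and I1) are kept in flight: slot 0 already
+            // holds tile 0 and the source window has just been advanced, so the read below fetches
+            // tile 1 into slot 1. The main loop then alternates the two slots, overlapping the
+            // global read of tile i+2 with the GEMM on tile i.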
+ // Read 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1); + } + + // Initialize C + c_thread_buf.Clear(); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + // Move + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Write i + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0); + + // Read i+2 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0); + + // Sync + block_sync_lds(); + + // Gemm i + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + // Sync + block_sync_lds(); + + // Move + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Write i+1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1); + + // Read i+3 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1); + + // Sync + block_sync_lds(); + + // Gemm i+1 + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + // Sync + block_sync_lds(); + + i += 2; + } while(i < (num_loop - 2)); + } + + // tail + { + // Write num_loop - 2 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I0); + + // Sync + block_sync_lds(); + + // Gemm num_loop - 2 + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + // Sync + block_sync_lds(); + + // Write num_loop - 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I1); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf, I1); + + // Sync + block_sync_lds(); + + // Gemm num_loop - 1 + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + +template +struct GridwiseGemmPipelineInterwave_v1; + +template <> +struct GridwiseGemmPipelineInterwave_v1<1> +{ + __host__ __device__ static constexpr bool IsSupported(index_t /* num_loop */) { return true; } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return num_loop > 1; + } + + template + static __device__ void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // preload data into LDS + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + 
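+                // The global reads for the next K tile were issued above, so they stay in flight
+                // while the tile already resident in LDS is consumed here. With the inter-wave
+                // scheduler the trailing LDS barrier is issued inside blockwise_gemm.Run, which is
+                // why the explicit call just below is commented out.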
blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + // block_sync_lds(); // moved into blockwise_gemm + + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + ++i; + } while(i < (num_loop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + +// Note: 2 stage prefetch not optimized for inter-wave loop scheduler +template <> +struct GridwiseGemmPipelineInterwave_v1<2> : public GridwiseGemmPipeline_v1<2> +{ +}; + +// TODO: deprecate as GridwiseGemmPipeline_Selector covers the functionality +template +constexpr auto GridwiseGemmPipeline_v1_Selector() +{ + if constexpr(LoopSched == LoopScheduler::Default) + { + return GridwiseGemmPipeline_v1{}; + } + else if constexpr(LoopSched == LoopScheduler::Interwave) + { + return GridwiseGemmPipelineInterwave_v1{}; + } +} + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp new file mode 100644 index 00000000..3281b910 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v2.hpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" + +namespace ck { + +struct GridwiseGemmPipeline_v2 +{ + __host__ __device__ static constexpr bool IsSupported(index_t num_loop) + { + // TODO: improve applicability + return num_loop % 2 == 0; + } + + __host__ __device__ static constexpr bool CalculateHasMainLoop(index_t num_loop) + { + return (num_loop / 2) > 1; + } + + template + __device__ static void Run(const AGridDesc& a_grid_desc, + const ABlockDesc& a_block_desc, + ABlockTransfer& a_blockwise_copy, + const AGridBuffer& a_grid_buf, + ABlockBuffer& a_block_buf, + const ABlockTransferStep& a_block_copy_step, + const BGridDesc& b_grid_desc, + const BBlockDesc& b_block_desc, + BBlockTransfer& b_blockwise_copy, + const BGridBuffer& b_grid_buf, + BBlockBuffer& b_block_buf, + const BBlockTransferStep& b_block_copy_step, + const BlockwiseGemm& blockwise_gemm, + CThreadBuffer& c_thread_buf, + index_t num_loop) + { + // global read 0 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + // move to 1 + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // Initialize C + c_thread_buf.Clear(); + + // LDS write 0 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + // global Read 1 + a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf); + + // LDS write 0 + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + // global Read 1 + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + // main body + if constexpr(HasMainLoop) + { + index_t i = 0; + + do + { + block_sync_lds(); + + // GEMM i + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + // move to i + 2 + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step); + + // LDS write i + 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + // global read i + 2 + a_blockwise_copy.RunRead(a_grid_desc, 
a_grid_buf); + + // LDS write i + 1 + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + // global read i + 2 + b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf); + + ++i; + } while(i < (num_loop - 2)); + } + + // tail + { + block_sync_lds(); + + // GEMM num_loop - 2 + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + // LDS write num_loop - 1 + a_blockwise_copy.RunWrite(a_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_block_desc, b_block_buf); + + block_sync_lds(); + + // GEMM num_loop - 1 + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..2fe55068 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_reduce_xdl_cshuffle_v1.hpp @@ -0,0 +1,879 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_reduce_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + ReducePtrsGlobal p_reduces_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const ReduceInElementwiseOperations reduce_in_element_ops, + const ReduceAccElementwiseOperations reduce_out_element_ops, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const ReduceGridDescriptor_MBlock_MPerBlock reduce_grid_desc_mblock_mperblock, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_reduces_grid, + p_shared, + a_element_op, + b_element_op, + c_element_op, + reduce_in_element_ops, + reduce_out_element_ops, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + reduce_grid_desc_mblock_mperblock, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_reduces_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + 
ignore = reduce_in_element_ops; + ignore = reduce_out_element_ops; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = reduce_grid_desc_mblock_mperblock; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemmReduce_k0mk1_k0nk1_mn_xdl_cshuffle_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatCShuffle)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + // 
static_assert(is_known_at_compile_time>::value && + // is_known_at_compile_time>::value, + // "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr auto + MakeReduceGridDescriptor_MBlock_MPerBlock(const ReduceGridDesc_M& d_grid_desc_m) + { + const auto M = d_grid_desc_m.GetLength(I0); + const auto MBlock = M / MPerBlock; + + const auto reduce_grid_desc_mblock_mperblock = transform_tensor_descriptor( + d_grid_desc_m, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{}))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + return reduce_grid_desc_mblock_mperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using ReduceGridDescriptor_MBlock_MPerBlock = + remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + ReducePtrsGlobal p_reduces_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const ReduceInElementwiseOperations& reduce_in_element_ops, + const ReduceAccElementwiseOperations& reduce_out_element_ops, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const 
ReduceGridDescriptor_MBlock_MPerBlock& reduce_grid_desc_mblock_mperblock, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + 
LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C + reduction + write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = 
ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + // TODO: this should be implemented as a blockwise reduction + // LDS c_reduce_block_desc_mperblock_nperblock + constexpr auto c_reduce_block_desc_mperblock_nperblock = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I1)), + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I3))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<>{}, Sequence<1>{})); + + static_assert(CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) * + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) == + BlockSize, + "wrong!"); + + static_assert((CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) % + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) == + 0 && + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) % + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) == + 0, + "wrong!"); + + constexpr index_t mreduce_per_thread = + (CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) / + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0); + + constexpr index_t nreduce_per_thread = + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) / + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1); + + constexpr auto c_reduce_thread_lengths_mperblock_nperblock = + Sequence{}; + + // VGPR c_reduce_thread_desc_mperblock_nperblock + constexpr auto c_reduce_thread_desc_mperblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + // VGPR reduce_thread_desc_mperblock + constexpr auto reduce_thread_desc_mperblock = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + // VGPR reduce_thread_desc_mblock_mperblock + constexpr auto reduce_thread_desc_mblock_mperblock = + 
make_naive_tensor_descriptor_packed(make_tuple(I1, Number{})); + + auto c_reduce_thread_buf = make_static_buffer( + c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize()); + + // reduce: threadwise copy from LDS to VGPR + constexpr auto c_reduce_thread_cluster_desc = make_cluster_descriptor( + CReduceThreadClusterLengths_MPerBlock_NPerBlock{}, Sequence<1, 0>{}); + + const auto c_reduce_thread_cluster_idx = + c_reduce_thread_cluster_desc.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto c_reduce_thread_data_idx_begin = + c_reduce_thread_cluster_idx * c_reduce_thread_lengths_mperblock_nperblock; + + auto c_reduce_thread_copy_lds_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatCShuffle, + FloatReduceAcc, + decltype(c_reduce_block_desc_mperblock_nperblock), + decltype(c_reduce_thread_desc_mperblock_nperblock), + decltype(c_reduce_thread_lengths_mperblock_nperblock), + Sequence<0, 1>, + 1, + CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock, + 1, + true>{c_reduce_block_desc_mperblock_nperblock, c_reduce_thread_data_idx_begin}; + + auto reduce_tuple_thread_copy_vgpr_to_global = generate_tuple( + [&](auto I) { + auto p_reduce_grid = p_reduces_grid[I]; + auto reduce_acc_element_op = reduce_out_element_ops[I]; + + return ThreadwiseTensorSliceTransfer_v1r3< + FloatReduceAcc, + remove_pointer_t, + decltype(reduce_thread_desc_mblock_mperblock), + decltype(reduce_grid_desc_mblock_mperblock), + decltype(reduce_acc_element_op), + Sequence<1, mreduce_per_thread>, + Sequence<0, 1>, + 1, + CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock, + ReduceGlobalMemoryDataOperation::At(I), + 1, + false>{reduce_grid_desc_mblock_mperblock, + make_multi_index(block_work_idx[I0], // mblock + c_reduce_thread_data_idx_begin[I0]), // mperblock + reduce_acc_element_op}; + }, + Number{}); + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + // TODO - extract following into reduction_blockwise + { + c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock, + c_shuffle_block_buf, + c_reduce_thread_desc_mperblock_nperblock, + make_tuple(I0, I0), + c_reduce_thread_buf); + + static_for<0, p_reduces_grid.Size(), 1>{}([&](auto In) { + auto& p_reduce_grid = p_reduces_grid[In]; + + auto reduce_grid_buf = make_dynamic_buffer( + p_reduce_grid, reduce_grid_desc_mblock_mperblock.GetElementSpaceSize()); + + auto reduce_thread_buf = + make_static_buffer( + reduce_thread_desc_mperblock.GetElementSpaceSize()); + + auto& reduce_in_element_op = reduce_in_element_ops[In]; + + auto& reduce_thread_copy_vgpr_to_global = + reduce_tuple_thread_copy_vgpr_to_global(In); + + using ReduceOperation = remove_cvref_t; + using ThreadwiseReduce = + ThreadwiseReduction; + + // Global write Gemm shuffle + reduction + const auto 
reduce_identityVal = + ReduceOperation::template GetIdentityValue(); + + static_for<0, mreduce_per_thread, 1>{}( + [&](auto I) { reduce_thread_buf(I) = reduce_identityVal; }); + + // reduce in VGPR + static_for<0, mreduce_per_thread, 1>{}([&](auto im) { + static_for<0, nreduce_per_thread, 1>{}([&](auto in) { + constexpr auto offset = + Number{}; + + reduce_in_element_op(c_reduce_thread_buf(offset), + c_reduce_thread_buf(offset)); + }); + }); + + ThreadwiseReduce::Reduce(c_reduce_thread_buf, reduce_thread_buf); + + // copy from VGPR to Global + reduce_thread_copy_vgpr_to_global.Run(reduce_thread_desc_mblock_mperblock, + make_tuple(I0, I0), + reduce_thread_buf, + reduce_grid_desc_mblock_mperblock, + reduce_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + reduce_thread_copy_vgpr_to_global.MoveDstSliceWindow( + reduce_grid_desc_mblock_mperblock, + make_tuple(c_global_step[I0], c_global_step[I1])); + } + }); + } + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + + // Reduction + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp new file mode 100644 index 00000000..aa89bff9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_split_k_multiple_d_xdl_cshuffle.hpp @@ -0,0 +1,1263 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// GEMM: +// input : A[M, K] +// input : B[N, K] +// input : D0[M, N], D1[M, N], ... +// output : E[M, N] +// C = a_op(A) * b_op(B) +// E = cde_op(C, D0, D1, ...) +// Assume: +// D0, D1, ... 
and E have the same layout +template +struct GridwiseGemmSplitKMultipleD_xdl_cshuffle +{ + static constexpr index_t NumDTensor = DsDataType::Size(); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + static constexpr auto AK0PerBlock = Number{}; + static constexpr auto BK0PerBlock = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = GridwiseGemmPipeline_v1; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, src of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0PerBlock, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetABlockDescriptor_AKB_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(I1, AK0PerBlock, Number{}, AK1), + make_tuple(AK0PerBlock * Number{} * AK1, + Number{} * AK1, + AK1, + I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, src of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0PerBlock, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BKB_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(I1, BK0PerBlock, Number{}, BK1), + make_tuple(BK0PerBlock * Number{} * BK1, + Number{} * BK1, + BK1, + I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + // ck::Tuple + static constexpr auto MakeDsGridPointer() + { + return generate_tuple( + [&](auto i) { + using DDataType = remove_cvref_t>; + + return static_cast(nullptr); + }, + Number{}); + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto 
c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(ABDataType), + c_block_size * sizeof(CShuffleDataType)); + } + + // A desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultAGridDescriptor_AKB_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k, + const int split_k) + { + const auto MRaw = a_grid_desc_m_k.GetLength(I0); + const auto KRaw = a_grid_desc_m_k.GetLength(I1); + + const index_t AK0 = + (math::integer_divide_ceil(KRaw, KPerBlock * split_k) * KPerBlock) / AK1; + const index_t K = split_k * AK0 * AK1; + const auto KPad = K - KRaw; + + const auto a_grid_desc_m_kpad = transform_tensor_descriptor( + a_grid_desc_m_k, + make_tuple(make_pass_through_transform(MRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + return transform_tensor_descriptor( + a_grid_desc_m_kpad, + make_tuple(make_unmerge_transform(make_tuple(split_k, AK0, AK1)), + make_pass_through_transform(MRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + + // B desc for source in blockwise copy + __host__ __device__ static constexpr auto + MakeDefaultBGridDescriptor_BKB_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k, + const int split_k) + { + const auto NRaw = b_grid_desc_n_k.GetLength(I0); + const auto KRaw = b_grid_desc_n_k.GetLength(I1); + + const index_t BK0 = + (math::integer_divide_ceil(KRaw, KPerBlock * split_k) * KPerBlock) / BK1; + const index_t K = split_k * BK0 * BK1; + const auto KPad = K - KRaw; + + const auto b_grid_desc_n_kpad = transform_tensor_descriptor( + b_grid_desc_n_k, + make_tuple(make_pass_through_transform(NRaw), make_right_pad_transform(KRaw, KPad)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return transform_tensor_descriptor( + b_grid_desc_n_kpad, + make_tuple(make_unmerge_transform(make_tuple(split_k, BK0, BK1)), + make_pass_through_transform(NRaw)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 1, 3>{}, Sequence<2>{})); + } + + // E desc for destination in blockwise copy + template + __host__ __device__ static constexpr auto MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + const EGridDescriptor_M_N& e_grid_desc_m_n) + { + const auto M = e_grid_desc_m_n.GetLength(I0); + const auto N = e_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto e_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + e_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return e_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // Ds desc for source in blockwise copy + template + __host__ __device__ static constexpr auto + MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( + const DsGridDescriptor_M_N& ds_grid_desc_m_n) + { + return generate_tuple( + [&](auto i) { + return MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n[i]); + }, + Number{}); + } + + // return block_id to E matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2ETileMap(const EGridDesc_M_N& e_grid_desc_m_n, const int split_k) + { + 
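// Added commentary (not part of the original source): the returned map prepends the
+        // split-K batch id to the usual (m0, n0) tile index; Run() below reads it back as
+        // block_work_idx[I0] / k_batch_id to dispatch Run0 vs Run1. The literal 8 is assumed
+        // to be the M01 grouping factor of the M00_N0_M01Adapt mapping, left at its default.
+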
return BlockToCTileMap_KSplit_M00_N0_M01Adapt( + e_grid_desc_m_n, 8, split_k); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AKB_AK0_M_AK1& a_grid_desc_akb_ak0_m_ak1, + const BGridDesc_BKB_BK0_N_BK1& b_grid_desc_bkb_bk0_n_bk1, + const DsGridDesc_M_N& ds_grid_desc_m_n, + const EGridDesc_M_N& e_grid_desc_m_n, + const Block2ETileMap& block_2_etile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_akb_ak0_m_ak1.GetLength(I2); + const auto N = b_grid_desc_bkb_bk0_n_bk1.GetLength(I2); + const auto K = + a_grid_desc_akb_ak0_m_ak1.GetLength(I1) * a_grid_desc_akb_ak0_m_ak1.GetLength(I3); + + if(K != b_grid_desc_bkb_bk0_n_bk1.GetLength(I1) * b_grid_desc_bkb_bk0_n_bk1.GetLength(I3)) + { + return false; + } + if(a_grid_desc_akb_ak0_m_ak1.GetLength(I0) != b_grid_desc_bkb_bk0_n_bk1.GetLength(I0)) + { + return false; + } + + // check consistency of desc + if(!(M == e_grid_desc_m_n.GetLength(I0) && N == e_grid_desc_m_n.GetLength(I1))) + { + return false; + } + + bool valid = true; + + static_for<0, NumDTensor, 1>{}([&](auto i) { + valid = valid && (M == ds_grid_desc_m_n[i].GetLength(I0) && + N == ds_grid_desc_m_n[i].GetLength(I1)); + }); + + if(!valid) + { + return false; + } + + // check tile size + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + { + return false; + } + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + // check block-to-E-tile + if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + // check tensor size: cannot be larger than 2GB each + constexpr long_index_t TwoGB = (long_index_t{1} << 31); + + if(!(a_grid_desc_akb_ak0_m_ak1.GetElementSpaceSize() * sizeof(ABDataType) <= TwoGB && + b_grid_desc_bkb_bk0_n_bk1.GetElementSpaceSize() * sizeof(ABDataType) <= TwoGB && + e_grid_desc_m_n.GetElementSpaceSize() * sizeof(EDataType) <= TwoGB)) + { + return false; + } + + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + using DefaultAGridDesc_AK0_M_AK1 = + remove_cvref_t; + using DefaultBGridDesc_BK0_N_BK1 = + remove_cvref_t; + using EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + using DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using DefaultBlock2ETileMap = + remove_cvref_t; + + using DsGridPointer = decltype(MakeDsGridPointer()); + + template + __device__ static void Run(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsGridPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const AGridDesc_AKB_AK0_M_AK1& a_grid_desc_akb_ak0_m_ak1, + const BGridDesc_BKB_BK0_N_BK1& b_grid_desc_bkb_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + 
e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap& block_2_etile_map) + { + const auto block_work_idx = + block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(block_work_idx[Number<0>{}] == 0) + { + Run0(p_a_grid, + p_b_grid, + p_ds_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + cde_element_op, + a_grid_desc_akb_ak0_m_ak1, + b_grid_desc_bkb_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); + } + else + { + Run1(p_a_grid, + p_b_grid, + p_e_grid, + p_shared, + a_element_op, + b_element_op, + a_grid_desc_akb_ak0_m_ak1, + b_grid_desc_bkb_bk0_n_bk1, + ds_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_etile_map); + } + } + template + __device__ static void Run0(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + DsGridPointer p_ds_grid, + EDataType* __restrict__ p_e_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CDEElementwiseOperation& cde_element_op, + const AGridDesc_AKB_AK0_M_AK1& a_grid_desc_akb_ak0_m_ak1, + const BGridDesc_BKB_BK0_N_BK1& b_grid_desc_bkb_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + ds_grid_desc_mblock_mperblock_nblock_nperblock, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap& block_2_etile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_akb_ak0_m_ak1.GetElementSpaceSize()); + + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bkb_bk0_n_bk1.GetElementSpaceSize()); + + const auto ds_grid_buf = generate_tuple( + [&](auto i) { + return make_dynamic_buffer( + p_ds_grid[i], + ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize()); + }, + Number{}); + + auto e_grid_buf = make_dynamic_buffer( + p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_etile_map.ValidCTileIndex( + make_tuple(block_work_idx[I1], block_work_idx[I2]), + make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t k_batch_id = block_work_idx[I0]; + + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto a_block_desc_akb_ak0_m_ak1 = + GetABlockDescriptor_AKB_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + constexpr auto b_block_desc_bkb_bk0_n_bk1 = + GetBBlockDescriptor_BKB_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + 
ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABDataType, + ABDataType, + decltype(a_grid_desc_akb_ak0_m_ak1), + decltype(a_block_desc_akb_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + ABlockTransferSrcVectorDim, + 3, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_akb_ak0_m_ak1, + make_multi_index(k_batch_id, 0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_akb_ak0_m_ak1, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + ABDataType, + ABDataType, + decltype(b_grid_desc_bkb_bk0_n_bk1), + decltype(b_block_desc_bkb_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + BBlockTransferSrcVectorDim, + 3, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bkb_bk0_n_bk1, + make_multi_index(k_batch_id, 0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bkb_bk0_n_bk1, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = + math::max(math::lcm(AK1, BK1), + MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + ABDataType, + AccDataType, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(0, KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(0, KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_v1_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_akb_ak0_m_ak1.GetLength(I1) * a_grid_desc_akb_ak0_m_ak1.GetLength(I3)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_akb_ak0_m_ak1, + a_block_desc_akb_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bkb_bk0_n_bk1, + b_block_desc_bkb_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % 
CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! + // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 
0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + { + // tuple of reference to C/Ds tensor descriptors + const auto c_ds_desc_refs = concat_tuple_of_reference( + tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; }, + Number{})); + + // tuple of reference to C/Ds tensor descriptors + const auto c_ds_buf_refs = concat_tuple_of_reference( + tie(c_shuffle_block_buf), + generate_tie( + [&](auto i) -> const auto& // return type should be reference + { return ds_grid_buf[i]; }, + Number{})); + + // tuple of starting index of C/Ds blockwise copy + const auto idx_c_ds_block_begin = container_concat( + make_tuple(make_multi_index(0, 0, 0, 0)), + generate_tuple( + [&](auto) { + return make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0); + }, + Number{})); + + // blockwise copy C/D/E between LDS and global + auto cde_block_copy_lds_and_global = ThreadGroupTensorSliceTransfer_v7< + ThisThreadBlock, + decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})), + Tuple, + decltype(c_ds_desc_refs), + decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)), + CDEElementwiseOperation, + Sequence(EGlobalMemoryDataOperation)>, // FIXME: make + // Sequence support + // arbitray type + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, + sequence_merge_t, + uniform_sequence_gen_t< + NumDTensor, + false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags + Sequence> // ThreadTransferDstResetCoordinateAfterRunFlags + {c_ds_desc_refs, + idx_c_ds_block_begin, + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + make_tuple(make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0)), + cde_element_op}; + + // space filling curve for threadwise C in VGPR before shuffle + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C/D/E + constexpr auto sfc_cde_block = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + cde_block_copy_lds_and_global.Run( + c_ds_desc_refs, + c_ds_buf_refs, + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + tie(e_grid_buf)); 
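+                    // Added commentary (not part of the original source): between accesses only
+                    // the Ds source windows and the E destination window are advanced below,
+                    // following the space-filling curve; the C shuffle tile read from LDS
+                    // restarts from the block origin each access, per the source reset flags
+                    // configured above.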
+ + if constexpr(access_id < num_access - 1) + { + constexpr auto cde_lds_and_global_step = + sfc_cde_block.GetForwardStep(access_id); + + // move on Ds + static_for<0, NumDTensor, 1>{}([&](auto i) { + cde_block_copy_lds_and_global.MoveSrcSliceWindow( + c_ds_desc_refs, i + I1, cde_lds_and_global_step); + }); + + // move on E + cde_block_copy_lds_and_global.MoveDstSliceWindow( + tie(e_grid_desc_mblock_mperblock_nblock_nperblock), + I0, + cde_lds_and_global_step); + } + }); + } + } + } + + template + __device__ static void Run1(const ABDataType* __restrict__ p_a_grid, + const ABDataType* __restrict__ p_b_grid, + EDataType* __restrict__ p_e_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const AGridDesc_AKB_AK0_M_AK1& a_grid_desc_akb_ak0_m_ak1, + const BGridDesc_BKB_BK0_N_BK1& b_grid_desc_bkb_bk0_n_bk1, + const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock&, + const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + e_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2ETileMap& block_2_etile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_akb_ak0_m_ak1.GetElementSpaceSize()); + + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bkb_bk0_n_bk1.GetElementSpaceSize()); + + auto e_grid_buf = make_dynamic_buffer( + p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_etile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_etile_map.ValidCTileIndex( + make_tuple(block_work_idx[I1], block_work_idx[I2]), + make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t k_batch_id = block_work_idx[I0]; + + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto a_block_desc_akb_ak0_m_ak1 = + GetABlockDescriptor_AKB_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + constexpr auto b_block_desc_bkb_bk0_n_bk1 = + GetBBlockDescriptor_BKB_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + ABDataType, + ABDataType, + decltype(a_grid_desc_akb_ak0_m_ak1), + decltype(a_block_desc_akb_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + ABlockTransferSrcVectorDim, + 3, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_akb_ak0_m_ak1, + make_multi_index(k_batch_id, 0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_akb_ak0_m_ak1, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise 
copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + ABDataType, + ABDataType, + decltype(b_grid_desc_bkb_bk0_n_bk1), + decltype(b_block_desc_bkb_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + BBlockTransferSrcVectorDim, + 3, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bkb_bk0_n_bk1, + make_multi_index(k_batch_id, 0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bkb_bk0_n_bk1, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = + math::max(math::lcm(AK1, BK1), + MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + ABDataType, + AccDataType, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(0, KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(0, KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_v1_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_akb_ak0_m_ak1.GetLength(I1) * a_grid_desc_akb_ak0_m_ak1.GetLength(I3)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_akb_ak0_m_ak1, + a_block_desc_akb_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bkb_bk0_n_bk1, + b_block_desc_bkb_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
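            // Shape of the 8-d accumulator descriptors used below (per the
            // unmerge comments further down): M0/N0 count the XDL output tiles
            // per wave (MXdlPerWave/NXdlPerWave), M1/N1 the waves per block
            // (MWave/NWave), and M2*M3*M4 = MPerXdl, N2 = NPerXdl give the lane
            // layout of one MFMA tile; each shuffle pass handles
            // CShuffleMXdlPerWavePerShuffle x CShuffleNXdlPerWavePerShuffle of
            // those tiles.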
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + { + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = 
ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + ck::tensor_operation::element_wise::PassThrough, // ElementwiseOperation, + EGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + CShuffleDataType, // typename SrcData, + EDataType, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(e_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CDEShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + e_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0), + ck::tensor_operation::element_wise::PassThrough{}}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + e_grid_desc_mblock_mperblock_nblock_nperblock, + e_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + e_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp new file mode 100644 index 00000000..ecc528a7 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v1.hpp @@ -0,0 +1,653 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
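// ---------------------------------------------------------------------------
// Editor's illustrative sketch, not part of this header: the gridwise GEMM
// below reuses a single LDS allocation for two phases, so
// GetSharedMemoryNumberOfByte() sizes it as the larger of the A+B tiles used
// by the GEMM main loop and the C-shuffle tile used by the write-out
// epilogue. The element counts and byte widths here (2-byte A/B, 4-byte C
// shuffle) are hypothetical stand-ins for the real tensor descriptors.
namespace ck_doc_sketch {

constexpr unsigned lds_bytes(unsigned a_tile_elems,        // A tile elements in LDS
                             unsigned b_tile_elems,        // B tile elements in LDS
                             unsigned c_shuffle_tile_elems) // C-shuffle tile elements in LDS
{
    const unsigned ab_bytes = (a_tile_elems + b_tile_elems) * 2; // main-loop phase
    const unsigned c_bytes  = c_shuffle_tile_elems * 4;          // epilogue phase
    return ab_bytes > c_bytes ? ab_bytes : c_bytes;              // both phases share the buffer
}

// e.g. 128x32 A tile plus 256x32 B tile vs. a 128x64 C-shuffle tile
static_assert(lds_bytes(128 * 32, 256 * 32, 128 * 64) == 32768, "example only");

} // namespace ck_doc_sketch
// ---------------------------------------------------------------------------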
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdl_cshuffle_v1(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + a_element_op, + b_element_op, + c_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + // FIXME: pass GridwiseGemmPipe as a template arguement into GridwiseGemm + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static 
constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + constexpr auto c_block_size = + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatCShuffle)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + 
make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + 
BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + static_assert(std::is_default_constructible_v); + const auto gridwise_gemm_pipeline = GridwiseGemmPipe{}; + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = 
ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to read from LDS + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp new file mode 100644 index 00000000..94e181cd --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_layernorm_cshuffle_v1.hpp @@ -0,0 +1,1068 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
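// ---------------------------------------------------------------------------
// Editor's illustrative sketch, not part of this header: a host-side
// reference of the fused epilogue implemented below, where each output row is
// normalized from its running sum and squared sum (var = E[x^2] - E[x]^2),
// then scaled by gamma and shifted by beta using the PyTorch-default
// eps = 1e-5 adopted by the kernel. Plain float pointers are the editor's
// simplification of the kernel's real data types.
#include <cmath>

namespace ck_doc_sketch {

inline void layernorm_row_reference(
    const float* x, const float* gamma, const float* beta, float* y, int n)
{
    float sum = 0.f, sq_sum = 0.f;
    for(int j = 0; j < n; ++j)
    {
        sum += x[j];           // accumulated as the d0 (sum) workspace in the kernel
        sq_sum += x[j] * x[j]; // accumulated as the d1 (squared sum) workspace
    }
    const float mean = sum / n;
    const float var  = sq_sum / n - mean * mean;
    const float rstd = 1.f / std::sqrt(var + 1e-5f);
    for(int j = 0; j < n; ++j)
        y[j] = (x[j] - mean) * rstd * gamma[j] + beta[j];
}

} // namespace ck_doc_sketch
// ---------------------------------------------------------------------------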
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" + +namespace ck { + +// D = Layernorm(acc_element_op(A * B + broadcast(bias)) + add) * broadcast(gamma) + broadcast(beta) +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_layernorm_xdl_cshuffle_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, // MxN + const FloatC0* __restrict__ p_c0_bias_grid, // 1xN + const FloatC0* __restrict__ p_c0_add_grid, // MxN + const FloatC0* __restrict__ p_c0_gamma_grid, // 1xN + const FloatC0* __restrict__ p_c0_beta_grid, // 1xN + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const AccElementwiseOperation acc_element_op, + const CElementwiseOperation c_element_op, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const C0GridDescriptor_NBlock_NPerBlock c0_grid_desc_nblock_nperblock, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + // TODO ANT: separate into MMA + Epilogue + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_c0_bias_grid, + p_c0_add_grid, + p_c0_gamma_grid, + p_c0_beta_grid, + p_shared, + a_element_op, + b_element_op, + acc_element_op, + c_element_op, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c0_grid_desc_nblock_nperblock, + block_2_ctile_map); + + // TODO ANT: Run layernorm epilogue here +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_c0_bias_grid; + ignore = p_c0_add_grid; + ignore = p_c0_gamma_grid; + ignore = p_c0_beta_grid; + ignore = a_element_op; + ignore = b_element_op; + ignore = acc_element_op; + ignore = c_element_op; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = c0_grid_desc_nblock_nperblock; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +// The GEMM + Layernorm implementation is a specialized kernel which allows fusing both layers +// together given the condition GEMM extents N of MNK is spanned by a single workgroup. 
For example, +// a kernel configured with NPerBlock = 128 allows to operate on all GEMM sizes if N <= 128 +template +struct GridwiseGemmLayernorm_k0mk1_k0nk1_mn_xdl_cshuffle_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + // A matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + // B matrix in LDS memory, dst of blockwise copy + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + + __host__ __device__ static constexpr auto + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + + return c_shuffle_block_desc_mblock_mperblock_nblock_nperblock; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = math::integer_least_multiple( + b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + // Align 16 bytes (maximum LDS read/write width) + constexpr auto c_block_size_aligned = + math::integer_least_multiple( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize() * + sizeof(FloatCShuffle), + 16) / + sizeof(FloatCShuffle); + + // LDS allocation for reduction workspace + constexpr index_t c_lds_workspace_size = BlockSize; + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size_aligned * sizeof(FloatCShuffle) + + c_lds_workspace_size * sizeof(FloatReduceAcc)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const 
Block2CTileMap& block_2_ctile_map) + { + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + return false; + + // in order to reduce N dim without elaborate sync across CUs in single kernel, one + // workgroup must span the entire N extent + if(math::integer_divide_ceil(N, NPerBlock) > 1) + { + return false; + } + + // static check: all waves in the workgroups combined must cover whole N extent in order + // to have efficient N-dim reduction + static_assert(CShuffleNXdlPerWavePerShuffle == NXdlPerWave, + "condition not met for efficient layernorm"); + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + const auto c_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + + return c_grid_desc_mblock_mperblock_nblock_nperblock; + } + + // for bias, beta, gamma + __host__ __device__ static constexpr auto + MakeC0GridDescriptor_NBlock_NPerBlock(const C0GridDesc_N& c0_grid_desc_n) + { + const auto N = c0_grid_desc_n.GetLength(I0); + const auto NBlock = N / NPerBlock; + + const auto c0_grid_desc_nblock_nperblock = transform_tensor_descriptor( + c0_grid_desc_n, + make_tuple(make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + + return c0_grid_desc_nblock_nperblock; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t; + + using C0GridDescriptor_NBlock_NPerBlock = + remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC0* __restrict__ p_c0_bias_grid, // 1xN + const FloatC0* __restrict__ p_c0_add_grid, // MxN + const FloatC0* __restrict__ p_c0_gamma_grid, // 1xN + const FloatC0* __restrict__ 
p_c0_beta_grid, // 1xN + void* __restrict__ p_shared, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const AccElementwiseOperation& acc_element_op, + const CElementwiseOperation& c_element_op, + const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const C0GridDescriptor_NBlock_NPerBlock& c0_grid_desc_nblock_nperblock, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + auto c0_bias_grid_buf = make_dynamic_buffer( + p_c0_bias_grid, c0_grid_desc_nblock_nperblock.GetElementSpaceSize()); + // Note: c0_add is of same layout as c so we don't declare new c0_add_desc here + auto c0_add_grid_buf = make_dynamic_buffer( + p_c0_add_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + auto c0_gamma_grid_buf = make_dynamic_buffer( + p_c0_gamma_grid, c0_grid_desc_nblock_nperblock.GetElementSpaceSize()); + auto c0_beta_grid_buf = make_dynamic_buffer( + p_c0_beta_grid, c0_grid_desc_nblock_nperblock.GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_AK1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 
2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_BK1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + constexpr index_t KPack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatGemmAcc, + decltype(a_block_desc_ak0_m_ak1), + decltype(b_block_desc_bk0_n_bk1), + MPerXdl, + NPerXdl, + MXdlPerWave, + NXdlPerWave, + KPack, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const auto gridwise_gemm_pipeline = + GridwiseGemmPipeline_Selector(); + + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + gridwise_gemm_pipeline.template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
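            // Epilogue outline for the fused layernorm: the accumulator tile is
            // staged through the C-shuffle LDS buffer, re-read per thread in the
            // CReduceThreadClusterLengths_MPerBlock_NPerBlock layout, combined
            // with the bias (through acc_element_op) and the c0_add residual,
            // and only then reduced blockwise into the sum and squared-sum
            // buffers that drive the normalization.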
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock = + GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // M0 (MXdlPerWave) per shuffle + M1, // M1 = MWave + M2, // M2 * M3 * M4 = MPerXdl + M3, + M4)), + make_freeze_transform(I0), + make_unmerge_transform(make_tuple( + Number{}, // N0 (NXdlPerWave) per shuffle + N1, // N1 = NWave + N2))), // N2 = NPerXdl + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // shuffle: threadwise copy C from VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + tensor_operation::element_wise::PassThrough{}}; + + // shuffle: blockwise copy C from LDS to global + auto c_shuffle_block_copy_lds_to_global = 
ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths, + CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CShuffleBlockTransferScalarPerVector_NPerBlock, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], 0, block_work_idx[I1], 0), + c_element_op}; + + const auto NBlock = c0_grid_desc_nblock_nperblock.GetLength(I0); + + // for broadcasting bias, beta, gamma + const auto c0_grid_desc_mblock_mperblock_nblock_nperblock = transform_tensor_descriptor( + c0_grid_desc_nblock_nperblock, + make_tuple(make_insert_transform(I1), + make_insert_transform(I1), + make_pass_through_transform(NBlock), + make_pass_through_transform(NPerBlock)), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + // LDS c_reduce_block_desc_mperblock_nperblock + constexpr auto c_reduce_block_desc_mperblock_nperblock = transform_tensor_descriptor( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I1)), + make_freeze_transform(I0), + make_pass_through_transform( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetLength(I3))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<>{}, Sequence<0>{}, Sequence<>{}, Sequence<1>{})); + + static_assert(CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) * + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) == + BlockSize, + "wrong!"); + + static_assert((CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) % + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0) == + 0 && + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) % + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1) == + 0, + "wrong!"); + + constexpr index_t mreduce_per_thread = + (CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl) / + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I0); + + constexpr index_t nreduce_per_thread = + (CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl) / + CReduceThreadClusterLengths_MPerBlock_NPerBlock::At(I1); + + constexpr auto c_reduce_thread_lengths_mperblock_nperblock = + Sequence{}; + + // pytorch default + // https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html + static constexpr FloatReduceAcc epsilon = 1e-5; + + // VGPR c_reduce_thread_desc_mperblock_nperblock + constexpr auto c_reduce_thread_desc_mperblock_nperblock = + make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + constexpr auto c_reduce_thread_desc_mblock_mperblock_nblock_nperblock = + 
make_naive_tensor_descriptor_packed( + make_tuple(I1, Number{}, I1, Number{})); + + // VGPR d_reduce_thread_desc_mperblock + constexpr auto d_reduce_thread_desc_mperblock = + make_naive_tensor_descriptor_packed(make_tuple(Number{})); + + // TODO: this should be implemented as a blockwise reduction + auto c_reduce_thread_buf = make_static_buffer( + c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize()); + + auto c0_thread_buf = make_static_buffer( + c_reduce_thread_desc_mperblock_nperblock.GetElementSpaceSize()); + + // Align 16 bytes (maximum LDS read/write width) + constexpr auto c_block_size_aligned = + math::integer_least_multiple( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize() * + sizeof(FloatCShuffle), + 16) / + sizeof(FloatCShuffle); + + auto d_reduce_work_buf = make_dynamic_buffer( + reinterpret_cast(static_cast(p_shared) + + c_block_size_aligned), + BlockSize); + + // Sum thread workspace + auto d0_thread_buf = make_static_buffer( + d_reduce_thread_desc_mperblock.GetElementSpaceSize()); + + // Squared sum thread workspace + auto d1_thread_buf = make_static_buffer( + d_reduce_thread_desc_mperblock.GetElementSpaceSize()); + + // reduce: threadwise copy from LDS to VGPR + constexpr auto c_reduce_thread_cluster_desc = make_cluster_descriptor( + CReduceThreadClusterLengths_MPerBlock_NPerBlock{}, Sequence<1, 0>{}); + + const auto c_reduce_thread_cluster_idx = + c_reduce_thread_cluster_desc.CalculateBottomIndex( + make_multi_index(get_thread_local_1d_id())); + + const auto c_reduce_thread_data_idx_begin = + c_reduce_thread_cluster_idx * c_reduce_thread_lengths_mperblock_nperblock; + + auto c_reduce_thread_copy_lds_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatCShuffle, + FloatReduceAcc, + decltype(c_reduce_block_desc_mperblock_nperblock), + decltype(c_reduce_thread_desc_mperblock_nperblock), + decltype(c_reduce_thread_lengths_mperblock_nperblock), + Sequence<0, 1>, + 1, + CReduceThreadCopySrcDstScalarPerVector_NPerBlock, + 1, + true>{c_reduce_block_desc_mperblock_nperblock, c_reduce_thread_data_idx_begin}; + + auto c_reduce_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3< + FloatReduceAcc, + FloatCShuffle, + decltype(c_reduce_thread_desc_mperblock_nperblock), + decltype(c_reduce_block_desc_mperblock_nperblock), + tensor_operation::element_wise::PassThrough, + decltype(c_reduce_thread_lengths_mperblock_nperblock), + Sequence<0, 1>, + 1, + CReduceThreadCopySrcDstScalarPerVector_NPerBlock, + InMemoryDataOperationEnum::Set, + 1, + true>{c_reduce_block_desc_mperblock_nperblock, + c_reduce_thread_data_idx_begin, + tensor_operation::element_wise::PassThrough{}}; + + auto c0_thread_copy_global_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatC0, + FloatC0, + decltype(c0_grid_desc_mblock_mperblock_nblock_nperblock), + decltype(c_reduce_thread_desc_mblock_mperblock_nblock_nperblock), + Sequence, + Sequence<0, 1, 2, 3>, + 3, + CReduceThreadCopySrcDstScalarPerVector_NPerBlock, + 1, + true>(c0_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], + c_reduce_thread_data_idx_begin[I0], + block_work_idx[I1], + c_reduce_thread_data_idx_begin[I1])); + + // Note: c0_add is of same layout as c so we don't declare new c0_add_desc here + auto c0_add_thread_copy_global_to_vgpr = ThreadwiseTensorSliceTransfer_v2< + FloatC0, + FloatC0, + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + decltype(c_reduce_thread_desc_mblock_mperblock_nblock_nperblock), + Sequence, + Sequence<0, 1, 2, 3>, + 3, + 
CReduceThreadCopySrcDstScalarPerVector_NPerBlock, + 1, + true>(c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I0], + c_reduce_thread_data_idx_begin[I0], + block_work_idx[I1], + c_reduce_thread_data_idx_begin[I1])); + + // space filling curve for threadwise C in VGPR + constexpr auto sfc_c_vgpr = + SpaceFillingCurve, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + Sequence>{}; + + // space filling curve for shuffled blockwise C in global mem + constexpr auto sfc_c_global = + SpaceFillingCurve, + Sequence<0, 2, 1, 3>, + Sequence<1, + CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{}; + + constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess(); + + static_assert(num_access == sfc_c_global.GetNumOfAccess(), "wrong!"); + + static_for<0, num_access, 1>{}([&](auto access_id) { + // make sure it's safe to write to LDS + block_sync_lds(); + + // each thread write its data from VGPR to LDS + c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + sfc_c_vgpr.GetIndexTupleOfNumber(access_id), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + block_sync_lds(); + + // load from LDS and global, add bias + c_reduce_thread_copy_lds_to_vgpr.Run(c_reduce_block_desc_mperblock_nperblock, + c_shuffle_block_buf, + c_reduce_thread_desc_mperblock_nperblock, + make_tuple(I0, I0), + c_reduce_thread_buf); + + c0_thread_copy_global_to_vgpr.Run( + c0_grid_desc_mblock_mperblock_nblock_nperblock, + c0_bias_grid_buf, + c_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c0_thread_buf); + + static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + FloatReduceAcc out; + acc_element_op(out, + c_reduce_thread_buf(i) + + static_cast(c0_thread_buf(i))); + c_reduce_thread_buf(i) = out; // acc_element_op(acc + bias) + }); + + c0_add_thread_copy_global_to_vgpr.Run( + c_grid_desc_mblock_mperblock_nblock_nperblock, + c0_add_grid_buf, + c_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c0_thread_buf); + + static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + c_reduce_thread_buf(i) += + static_cast(c0_thread_buf(i)); // add + }); + + // layernorm + { + using ThreadwiseReduceD0 = + ThreadwiseReduction; + using ThreadwiseReduceD1 = + ThreadwiseReduction; + + const auto d0_zeroVal = + ThreadwiseReduceD0::Op::template GetIdentityValue(); + const auto d1_zeroVal = + ThreadwiseReduceD1::Op::template GetIdentityValue(); + static_for<0, mreduce_per_thread, 1>{}( + [&](auto i) { d0_thread_buf(i) = d0_zeroVal; }); + static_for<0, mreduce_per_thread, 1>{}( + [&](auto i) { d1_thread_buf(i) = d1_zeroVal; }); + + // reduce sum in VGPR + ThreadwiseReduceD0::Reduce(c_reduce_thread_buf, d0_thread_buf); + + // reduce squared sum in VGPR + ThreadwiseReduceD1::Reduce(c_reduce_thread_buf, d1_thread_buf); + + // reduce within workgroup + using BlockwiseReduce = PartitionedBlockwiseReduction< + FloatReduceAcc, + BlockSize, + CReduceThreadClusterLengths_MPerBlock_NPerBlock, // ThreadClusterLengths_M_K + Sequence<1, 0>, // ThreadClusterArrangeOrder + reduce::Add, + false>; + + static_for<0, mreduce_per_thread, 1>{}([&](auto i) { + block_sync_lds(); + BlockwiseReduce::Reduce(d_reduce_work_buf, + d0_thread_buf(i)); // blockwise reduced sum + block_sync_lds(); + BlockwiseReduce::Reduce(d_reduce_work_buf, + d1_thread_buf(i)); // blockwise reduced squared sum + }); + + // 
normalize + const index_t NRaw = + c_grid_desc_mblock_mperblock_nblock_nperblock.GetTransforms()[I0] + .GetUpperLengths()[I1]; // TODO: proper handle + + static_for<0, mreduce_per_thread, 1>{}([&](auto im) { + static_for<0, nreduce_per_thread, 1>{}([&](auto in) { + constexpr auto dst_offset = + Number{}; + + constexpr auto src_offset = + Number{}; + + FloatReduceAcc avg_sum = d0_thread_buf(src_offset) / NRaw; + FloatReduceAcc avg_squared_sum = d1_thread_buf(src_offset) / NRaw; + + FloatReduceAcc numerator = c_reduce_thread_buf(dst_offset) - avg_sum; + FloatReduceAcc divisor = epsilon + avg_squared_sum - avg_sum * avg_sum; + FloatReduceAcc divisor_sqrt; + tensor_operation::element_wise::UnarySqrt{}(divisor_sqrt, divisor); + + c_reduce_thread_buf(dst_offset) = numerator / divisor_sqrt; + }); + }); + + // scaling + c0_thread_copy_global_to_vgpr.Run( + c0_grid_desc_mblock_mperblock_nblock_nperblock, + c0_gamma_grid_buf, + c_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c0_thread_buf); + + static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + c_reduce_thread_buf(i) *= + static_cast(c0_thread_buf(i)); // * gamma + }); + + c0_thread_copy_global_to_vgpr.Run( + c0_grid_desc_mblock_mperblock_nblock_nperblock, + c0_beta_grid_buf, + c_reduce_thread_desc_mblock_mperblock_nblock_nperblock, + make_tuple(I0, I0, I0, I0), + c0_thread_buf); + + static_for<0, c_reduce_thread_desc_mperblock_nperblock.GetElementSize(), 1>{}( + [&](auto i) { + c_reduce_thread_buf(i) += + static_cast(c0_thread_buf(i)); // + beta + }); + + block_sync_lds(); + + c_reduce_thread_copy_vgpr_to_lds.Run(c_reduce_thread_desc_mperblock_nperblock, + make_tuple(I0, I0), + c_reduce_thread_buf, + c_reduce_block_desc_mperblock_nperblock, + c_shuffle_block_buf); + + } // end layernorm + + block_sync_lds(); + + // each block copy its data from LDS to global + c_shuffle_block_copy_lds_to_global.Run( + c_shuffle_block_desc_mblock_mperblock_nblock_nperblock, + c_shuffle_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + if constexpr(access_id < num_access - 1) + { + constexpr auto c_global_step = sfc_c_global.GetForwardStep(access_id); + + // move on C + c_shuffle_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + + // move on C0 + c0_thread_copy_global_to_vgpr.MoveSrcSliceWindow( + c0_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + + // move on C0_add + c0_add_thread_copy_global_to_vgpr.MoveSrcSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, c_global_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp new file mode 100644 index 00000000..126887cb --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp @@ -0,0 +1,983 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
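+// [Editor's note] Hedged sketch, not part of the original sources: the
+// Merge_v4_no_carry transform defined below recovers the lower indices of a
+// merged dimension with plain division and modulo, and is only intended for
+// low lengths that are compile-time powers of two. For low lengths {L0, L1, L2}
+// the reverse exclusive scan of products is {L1*L2, L2, 1}, so an upper index
+// u decomposes as
+//
+//   idx_low[0] = u / (L1 * L2);  u %= (L1 * L2);
+//   idx_low[1] = u / L2;         u %= L2;
+//   idx_low[2] = u;
+//
+// e.g. with hypothetical low lengths {4, 2, 8} and u = 37 the scan is
+// {16, 8, 1} and idx_low = {2, 0, 5}, since 2*16 + 0*8 + 5 = 37. The
+// "no_carry" part is the assumption that index updates never carry into the
+// slower dimensions, which is why UpdateLowerIndex only rewrites the last one.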
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// Implementation of "Merge" transformation primitive that uses division and mod. It is supposed to +// be used for low_lengths that are known at compile time and are power of 2, otherwise performance +// will be very bad +template +struct Merge_v4_no_carry +{ + static constexpr index_t NDimLow = LowLengths::Size(); + + using LowerIndex = MultiIndex; + using UpperIndex = MultiIndex<1>; + + using LowLengthsScan = + decltype(container_reverse_exclusive_scan(LowLengths{}, math::multiplies{}, Number<1>{})); + + using UpLengths = + decltype(make_tuple(container_reduce(LowLengths{}, math::multiplies{}, Number<1>{}))); + + LowLengths low_lengths_; + LowLengthsScan low_lengths_scan_; + UpLengths up_lengths_; + + __host__ __device__ constexpr Merge_v4_no_carry() = default; + + __host__ __device__ constexpr Merge_v4_no_carry(const LowLengths& low_lengths) + : low_lengths_{low_lengths}, + low_lengths_scan_{ + container_reverse_exclusive_scan(low_lengths, math::multiplies{}, Number<1>{})}, + up_lengths_{make_tuple(container_reduce(low_lengths, math::multiplies{}, Number<1>{}))} + { + static_assert(LowerIndex::Size() == NDimLow, "wrong!"); + } + + __host__ __device__ static constexpr index_t GetNumOfLowerDimension() { return NDimLow; } + + __host__ __device__ static constexpr index_t GetNumOfUpperDimension() { return 1; } + + __host__ __device__ constexpr const auto& GetUpperLengths() const { return up_lengths_; } + + template + __host__ __device__ constexpr void CalculateLowerIndex(LowIdx& idx_low, + const UpIdx& idx_up) const + { + static_assert(LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! inconsistent # of dimension"); + + index_t tmp = idx_up[Number<0>{}]; + + // division and mod + static_for<0, NDimLow - 1, 1>{}([&](auto i) { + idx_low(i) = tmp / this->low_lengths_scan_[i]; + tmp %= this->low_lengths_scan_[i]; + }); + + idx_low(Number{}) = tmp; + } + + template + __host__ __device__ void UpdateLowerIndex(LowIdxDiff& idx_diff_low, + const UpIdxDiff& idx_up_diff, + LowIdx& idx_low, + const UpIdx& idx_up_new, + Number) const + { + static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 && + LowIdx::Size() == NDimLow && UpIdx::Size() == 1, + "wrong! 
inconsistent # of dimension"); + + constexpr auto I0 = Number<0>{}; + constexpr auto INm1 = Number{}; + + index_t tmp = idx_up_new[I0]; + + idx_low(INm1) = tmp; + idx_diff_low(INm1) = idx_up_diff[I0]; + } + + __host__ __device__ static constexpr bool IsLinearTransform() { return false; } + + __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex() + { + return true; + } + + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return is_known_at_compile_time::value && + is_known_at_compile_time::value && + is_known_at_compile_time::value; + } + + template + __host__ __device__ static constexpr bool + IsValidUpperIndexMappedToValidLowerIndex(const UpIdx& /* idx_up */) + { + return true; + } + + __host__ __device__ void Print() const + { + printf("{"); + printf("Merge_v3_direct_division_mod_wrw, "); + printf("low_lengths_ "); + print_multi_index(low_lengths_); + printf("low_lengths_scan_ "); + print_multi_index(low_lengths_scan_); + printf("up_lengths_ "); + print_multi_index(up_lengths_); + printf("}"); + } +}; + +template +__host__ __device__ constexpr auto make_merge_transform_v4_no_carry(const LowLengths& low_lengths) +{ + return Merge_v4_no_carry{low_lengths}; +} + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_bwd_weight(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const CBlockClusterAdaptor c_block_cluster_adaptor) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_b_k0_m_k1_grid_desc, + b_b_k0_n_k1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + c_element_op, + c_block_cluster_adaptor); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_b_k0_m_k1_grid_desc; + ignore = b_b_k0_n_k1_grid_desc; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = c_block_cluster_adaptor; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + // M0/M1/M1Padding + static constexpr auto M1PerBlock = Number{}; + static constexpr auto M0PerBlock = Number{}; + static constexpr auto M1Padding = Number{}; + + // N0/N1/N1Padding + static 
constexpr auto N1PerBlock = Number{}; + static constexpr auto N0PerBlock = Number{}; + static constexpr auto N1Padding = Number{}; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + if constexpr(ABlockLdsExtraM1Wrw) + { + constexpr auto a_block_desc_k0_m0_m1_k1 = make_naive_tensor_descriptor( + make_tuple( + Number{}, Number{}, Number{}, K1), + make_tuple(Number{} * (Number{} * K1 + M1Padding), + Number{} * K1 + M1Padding, + K1, + I1)); + + constexpr auto a_block_desc_k0_m_k1_tmp = transform_tensor_descriptor( + a_block_desc_k0_m0_m1_k1, + make_tuple(make_pass_through_transform(Number{}), + make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return a_block_desc_k0_m_k1_tmp; + } + else + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return a_block_desc_k0_m_k1; + } + + __host__ __device__ static constexpr auto GetABlockDescriptor_Batch_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_b_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + if constexpr(ABlockLdsExtraM1Wrw) + { + constexpr auto a_block_desc_b_k0_m0_m1_k1 = make_naive_tensor_descriptor( + make_tuple(Number<1>{}, + Number{}, + Number{}, + Number{}, + K1), + make_tuple(Number{} * Number{} * + (Number{} * K1 + M1Padding), + Number{} * (Number{} * K1 + M1Padding), + Number{} * K1 + M1Padding, + K1, + I1)); + + constexpr auto a_block_desc_b_k0_m_k1_tmp = transform_tensor_descriptor( + a_block_desc_b_k0_m0_m1_k1, + make_tuple(make_pass_through_transform(Number<1>{}), + make_pass_through_transform(Number{}), + make_merge_transform_v4_no_carry( + make_tuple(Number{}, Number{})), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + return a_block_desc_b_k0_m_k1_tmp; + } + else + { + return make_naive_tensor_descriptor( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + K1, + I1)); + } + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + max_lds_align); + } + }(); + + return a_block_desc_b_k0_m_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + if constexpr(BBlockLdsExtraN1Wrw) + { + constexpr auto b_block_desc_k0_n0_n1_k1 = make_naive_tensor_descriptor( + make_tuple( + Number{}, Number{}, Number{}, K1), + make_tuple(Number{} * (Number{} * K1 + N1Padding), + Number{} * K1 + N1Padding, + K1, + I1)); + + constexpr auto b_block_desc_k0_n_k1_tmp = transform_tensor_descriptor( + b_block_desc_k0_n0_n1_k1, + make_tuple(make_pass_through_transform(Number{}), + 
make_merge_transform_v3_division_mod( + make_tuple(Number{}, Number{})), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + return b_block_desc_k0_n_k1_tmp; + } + else + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return b_block_desc_k0_n_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_b_k0_n_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + if constexpr(BBlockLdsExtraN1Wrw) + { + constexpr auto b_block_desc_b_k0_n0_n1_k1 = make_naive_tensor_descriptor( + make_tuple(Number<1>{}, + Number{}, + Number{}, + Number{}, + K1), + make_tuple(Number{} * Number{} * + (Number{} * K1 + N1Padding), + Number{} * (Number{} * K1 + N1Padding), + Number{} * K1 + N1Padding, + K1, + I1)); + + constexpr auto b_block_desc_b_k0_n_k1_tmp = transform_tensor_descriptor( + b_block_desc_b_k0_n0_n1_k1, + make_tuple(make_pass_through_transform(Number<1>{}), + make_pass_through_transform(Number{}), + make_merge_transform_v4_no_carry( + make_tuple(Number{}, Number{})), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + return b_block_desc_b_k0_n_k1_tmp; + } + else + { + return make_naive_tensor_descriptor( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + K1, + I1)); + } + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + max_lds_align); + } + }(); + + return b_block_desc_b_k0_n_k1; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_b_k0_m_k1_block_desc = GetABlockDescriptor_Batch_K0PerBlock_MPerBlock_K1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_b_k0_n_k1_block_desc = GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = math::integer_least_multiple( + a_b_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size = math::integer_least_multiple( + b_b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto c_block_size = + GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize(); + + return math::max((a_block_space_size + b_block_space_size) * sizeof(FloatAB), + c_block_size * sizeof(FloatC)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! 
K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerXDL)) == 0, + "Invalid tuning param!"); + + const auto M = a_b_k0_m_k1_grid_desc.GetLength(I2); + const auto N = b_b_k0_n_k1_grid_desc.GetLength(I2); + const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); + const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0); + + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) && + K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) && + K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) && + KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0) + { + // const bool has_main_k0_block_loop = K0 > K0PerBlock; + const index_t num_loop = K0 / K0PerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + + // return has_main_k0_block_loop; + } + + __host__ __device__ static constexpr auto + MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + return transform_tensor_descriptor( + c_m_n_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor( + const CMNGridDesc& c_m_n_grid_desc, index_t M01, index_t N01, index_t KBatch) + { + return BlockToCTileMap_KSplit_M00_N00_M01_N01( + c_m_n_grid_desc, M01, N01, KBatch); + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL); + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + } + + using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = + decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{})); + using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)); + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const CBlockClusterAdaptor& c_block_cluster_adaptor) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, 
a_b_k0_m_k1_grid_desc.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); + + // divide block work by [M, N] + const auto block_work_idx = + c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + const index_t k_batch_id = block_work_idx[I0]; + + if(!c_block_cluster_adaptor.ValidCTileIndex( + make_tuple(block_work_idx[I1], block_work_idx[I2]), + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k0_m_k1_block_desc = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto a_b_k0_m_k1_block_desc = GetABlockDescriptor_Batch_K0PerBlock_MPerBlock_K1(); + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k0_n_k1_block_desc = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto b_b_k0_n_k1_block_desc = GetBBlockDescriptor_Batch_K0PerBlock_NPerBlock_K1(); + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_b_k0_m_k1_grid_desc), + decltype(a_b_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + ABlockTransferSrcVectorDim, + 3, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_b_k0_m_k1_grid_desc, + make_multi_index(k_batch_id, 0, m_block_data_idx_on_grid, 0), + a_element_op, + a_b_k0_m_k1_block_desc, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_b_k0_n_k1_grid_desc), + decltype(b_b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + BBlockTransferSrcVectorDim, + 3, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_b_k0_n_k1_grid_desc, + make_multi_index(k_batch_id, 0, n_block_data_idx_on_grid, 0), + b_element_op, + b_b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + constexpr index_t KPack = + math::max(K1, MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS 
allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block = p_shared_block; + FloatAB* p_b_block = p_shared_block + a_block_space_size; + + constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); + + auto a_block_buf = make_dynamic_buffer( + p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer( + p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); + + // gridwise GEMM pipeline + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + + GridwiseGemmPipe::template Run(a_b_k0_m_k1_grid_desc, + a_b_k0_m_k1_block_desc, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_b_k0_n_k1_grid_desc, + b_b_k0_n_k1_block_desc, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + K0BlockMainLoop); + + // output: register to global memory + { + constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL); + + constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I0); + constexpr auto N0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I1); + constexpr auto M1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I2); + constexpr auto N1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I3); + constexpr auto M2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I4); + constexpr auto M3 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I5); + constexpr auto M4 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I6); + constexpr auto N2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I7); + + constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock = + GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + void* p_shared = static_cast(p_shared_block); + + auto c_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + static_assert(M1 == MWave, ""); + static_assert(N1 == NWave, ""); + static_assert(M2 * M3 * M4 == MPerXDL, ""); + static_assert(N2 == NPerXDL, ""); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), // freeze mblock + make_unmerge_transform(make_tuple(CShuffleMRepeatPerShuffle, + M1, + M2, + M3, + M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL + make_freeze_transform(I0), // freeze nblock + make_unmerge_transform(make_tuple(CShuffleNRepeatPerShuffle, + N1, + N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = 
c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // LDS to global + auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerXDL, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatC, // typename SrcData, + FloatC, // typename DstData, + decltype(c_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CBlockTransferScalarPerVector_NWaveNPerXDL, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun + {c_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0), + c_element_op}; + + constexpr auto mxdlperwave_forward_step = + make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0); + constexpr auto nxdlperwave_forward_step = + make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL); + constexpr auto nxdlperwave_backward_step = + make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL); + + static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) { + constexpr auto mxdlperwave = mxdlperwave_iter; + + static_for<0, NRepeat, CShuffleNRepeatPerShuffle>{}([&](auto nxdlperwave_iter) { + constexpr bool nxdlperwave_forward_sweep = + (mxdlperwave % (2 * CShuffleMRepeatPerShuffle) == 0); + + constexpr index_t nxdlperwave_value = + nxdlperwave_forward_sweep + ? 
nxdlperwave_iter + : (NRepeat - nxdlperwave_iter - CShuffleNRepeatPerShuffle); + + constexpr auto nxdlperwave = Number{}; + + // make sure it's safe to do ds_write + block_sync_lds(); + + // VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc, + make_tuple(mxdlperwave, nxdlperwave, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_block_buf); + + // make sure it's safe to do ds_read + block_sync_lds(); + + // LDS to global + c_block_copy_lds_to_global.Run(c_block_desc_mblock_mperblock_nblock_nperblock, + c_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + // move on nxdlperwave dimension + if constexpr(nxdlperwave_forward_sweep && + (nxdlperwave < NRepeat - CShuffleNRepeatPerShuffle)) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + nxdlperwave_forward_step); + } + else if constexpr((!nxdlperwave_forward_sweep) && (nxdlperwave > 0)) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + nxdlperwave_backward_step); + } + }); + + // move on mxdlperwave dimension + if constexpr(mxdlperwave < MRepeat - CShuffleMRepeatPerShuffle) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, mxdlperwave_forward_step); + } + }); + } + } +}; // namespace ck + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp new file mode 100644 index 00000000..2aad7128 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_skip_b_lds_v1.hpp @@ -0,0 +1,678 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
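+// [Editor's note] Hedged summary, not part of the original sources: in this
+// "skip B LDS" variant only the A tile is staged through LDS; each thread
+// streams its own B fragments from global memory directly into registers
+// (b_thread_buf), multi-buffered BBlockBufferSize deep. A rough sketch of the
+// per-thread bookkeeping, assuming WaveSize = 64 as declared below:
+//
+//   // wave coordinates from the flat thread id (cf. GetWaveIdx)
+//   m_wave = tid / (NWaves * WaveSize);
+//   n_wave = (tid / WaveSize) % NWaves;
+//   lane   = tid % WaveSize;
+//
+//   // each xdlops issue covers K0PerXdlops of the K0 slices, so one thread
+//   // owns K0PerThread = K0PerBlock / xdlops_gemm.K0PerXdlops of them
+//
+// Skipping the B staging saves the LDS space and synchronization a B tile
+// would need, at the cost of less B reuse across the waves of a block.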
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops_skip_b_lds.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_skip_b_lds_v1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3; + ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_skip_b_lds_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + static constexpr index_t WaveSize = 64; + static constexpr index_t MWaves = MPerBlock / (MXdlPerWave * MPerXDL); + static constexpr index_t NWaves = NPerBlock / (NXdlPerWave * NPerXDL); + + static constexpr auto xdlops_gemm = XdlopsGemm{}; + static constexpr index_t K0PerThread = K0PerBlock / xdlops_gemm.K0PerXdlops; + + using ThisThreadBlock = ThisThreadBlock; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), + max_lds_align); + } + }(); + + return a_block_desc_k0_m_k1; + } + + 
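+    // [Editor's note] Hedged illustration, not from the original sources: the
+    // ABlockLdsExtraM branch of GetABlockDescriptor_K0PerBlock_MPerBlock_K1()
+    // above pads each K0 "row" of the A tile in LDS, typically to a row stride
+    // of (MPerBlock + 1) * K1 elements instead of the packed MPerBlock * K1.
+    // With hypothetical MPerBlock = 128, K1 = 8 and fp16 data the packed
+    // stride is 1024 elements (2048 bytes), a multiple of the 32-bank * 4-byte
+    // LDS width, so successive K0 rows land on the same banks; the padded
+    // stride of 1032 elements staggers the rows and avoids those bank
+    // conflicts at the cost of a little unused LDS.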
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size_aligned) * sizeof(FloatAB); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n, + index_t M01, + index_t N01) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXDL)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + // 2-stage prefetch currently only support even number of K0 loop + // TODO: add support for odd number of K0 loop + if(!((K0 / K0PerBlock) % BBlockBufferSize == 0)) + { + return false; + } + + // check M01, N01 + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + if(!(M0 % M01 == 0 && N0 % N01 == 0)) + return false; + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr index_t + CalculateGridSize(const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const index_t grid_size = (M / MPerBlock) * (N / NPerBlock); + + return grid_size; + } + + // TODO move this function into GEMM-pipeline class + __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0) + { + const bool has_main_k0_block_loop = (K0 / (BBlockBufferSize * K0PerBlock)) > 1; + + return has_main_k0_block_loop; + } + + __host__ __device__ static constexpr auto + MakeBGridDescriptor_K0_K1_K2_N0_N1_N2_N3_K3(const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1) + { + const auto K0 = b_grid_desc_k0_n_k1.GetLength(I0); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + + const auto b_griddesc_k0_nblockid_nrepeat_waves_nperxdlops_k1 = transform_tensor_descriptor( + b_grid_desc_k0_n_k1, + make_tuple(make_unmerge_transform( + make_tuple(K0 / K0PerBlock, xdlops_gemm.K0PerXdlops, K0PerThread)), + make_unmerge_transform(make_tuple( + N / (NXdlPerWave * NWaves * NPerXDL), NXdlPerWave, NWaves, NPerXDL)), + make_pass_through_transform(K1)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5, 6>{}, Sequence<7>{})); + return b_griddesc_k0_nblockid_nrepeat_waves_nperxdlops_k1; + } + + __device__ static auto GetWaveIdx() + { + const index_t thread_id = get_thread_local_1d_id(); + + constexpr auto threadid_to_wave_idx_adaptor = 
make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(MWaves, NWaves, WaveSize))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + return threadid_to_wave_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __device__ static auto GetWaveKNIdx(const index_t thread_id) + { + constexpr auto wave_threadid_to_nk_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(xdlops_gemm.K0PerXdlops, NPerXDL))), + make_tuple(Sequence<0, 1>{}), + make_tuple(Sequence<0>{})); + + return wave_threadid_to_nk_idx_adaptor.CalculateBottomIndex(make_multi_index(thread_id)); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix threadwise copy + constexpr auto b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + I1, + Number{}, // K0PerThread + I1, // NBlockId + Number{}, // repeat + I1, // waves + I1, // NPerXdlops + Number{})); + + using BlockwiseGemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1< + BlockSize, + FloatAB, + FloatAcc, + decltype(a_block_desc_k0_m_k1), + decltype(b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3), + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + K1>; + + return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n); + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto + MakeDefaultBlock2CTileMap(const CGridDesc_M_N& c_grid_desc_m_n, index_t M01, index_t N01) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + constexpr auto M1 = Number{}; + constexpr auto N1 = Number{}; + + const auto M0 = M / M1; + const auto N0 = N / N1; + + const auto M00 = M0 / M01; + const auto N00 = N0 / N01; + + const auto m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_unmerge_transform(make_tuple(M00, M01)), + make_unmerge_transform(make_tuple(N00, N01))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{})); + + const auto cblockid_to_m00_m01_n00_n01_block_cluster_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M00, N00, M01, N01))), + make_tuple(Sequence<0, 1, 2, 3>{}), + make_tuple(Sequence<0>{})); + + const auto cblockid_to_m0_n0_block_cluster_adaptor = + chain_tensor_adaptors(m00_m01_n00_n01_to_m0_n0_block_cluster_adaptor, + cblockid_to_m00_m01_n00_n01_block_cluster_adaptor); + + return cblockid_to_m0_n0_block_cluster_adaptor; + } + + using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = + decltype(MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{})); + using DefaultBlock2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1)); + using BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 = + decltype(MakeBGridDescriptor_K0_K1_K2_N0_N1_N2_N3_K3(BGridDesc_K0_N_K1{})); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const 
FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_K1_K2_N0_N1_N2_N3_K3 b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2& c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize()); + + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + // A matrix blockwise copy + auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1< + ThisThreadBlock, + AElementwiseOperation, + ck::tensor_operation::element_wise::PassThrough, + InMemoryDataOperationEnum::Set, + Sequence, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_k0_m_k1), + decltype(a_block_desc_k0_m_k1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + 1>(a_grid_desc_k0_m_k1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_k0_m_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + ignore = b_element_op; + // B matrix threadwise copy + constexpr auto b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3 = + make_naive_tensor_descriptor_packed(make_tuple(I1, + I1, + Number{}, // K0PerThread + I1, // NBlockId + Number{}, // repeat + I1, // waves + I1, // NPerXdlops + Number{})); + + auto b_thread_buf = generate_tuple( + [&](auto i) { + ignore = i; + return StaticBuffer{}; + }, + Number{}); + + const auto wave_id = GetWaveIdx(); + const auto wave_k_n_id = GetWaveKNIdx(wave_id[I2]); + +#if 0 + const index_t block_id = get_block_1d_id(); + const index_t thread_id = get_thread_local_1d_id(); + printf("block id: %d m blockid: %d n block id: %d ,thread id: %d, wave id :{%d %d %d} " + "kn id: {%d %d}\n", + block_id, + block_work_idx[I0], + block_work_idx[I1], + thread_id, + wave_id[I0], + wave_id[I1], + wave_id[I2], + wave_k_n_id[I0], + wave_k_n_id[I1]); + printf("mfma thread k per xdlops: %d K0PerThread: %d HasMainK0BlockLoop: %d K0: %d \t", + xdlops_gemm.K0PerXdlops, K0PerThread, HasMainK0BlockLoop, b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3.GetLength(I0)); +#endif + + auto b_threadwise_copy = + ThreadwiseTensorSliceTransfer_v2{}, + I1, + Number{}, + I1, + I1, + Number{}>, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + BBlockTransferSrcScalarPerVector, + 
BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + make_multi_index( + 0, wave_k_n_id[I0], 0, block_work_idx[I1], 0, wave_id[I1], wave_k_n_id[I1], 0)); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1r1< + BlockSize, + FloatAB, + FloatAcc, + decltype(a_block_desc_k0_m_k1), + decltype(b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3), + MPerBlock, + NPerBlock, + K0PerBlock, + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + K1>{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + + // gridwise GEMM pipeline + constexpr auto a_block_slice_copy_step = + make_multi_index(K0PerBlock * BBlockBufferSize, 0, 0); + constexpr auto b_thread_slice_copy_step = make_multi_index(1, 0, 0, 0, 0, 0, 0, 0); + // preload data to regiester and LDS + { + // Read + a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf); + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, a_block_slice_copy_step); + + static_for<0, BBlockBufferSize, 1>{}([&](auto ii) { + b_threadwise_copy.Run(b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + b_grid_buf, + b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_buf(Number{})); + b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + b_thread_slice_copy_step); + }); + + // Initialize C + c_thread_buf.Clear(); + // a data write to lds + a_blockwise_copy.RunWrite(a_block_desc_k0_m_k1, a_block_buf); + // main body + if constexpr(HasMainK0BlockLoop) + { + index_t K0BlockMainLoop = + __builtin_amdgcn_readfirstlane(K0 / (BBlockBufferSize * K0PerBlock)); + index_t i = 0; + do + { + a_blockwise_copy.RunRead(a_grid_desc_k0_m_k1, a_grid_buf); + blockwise_gemm.ResetABlockStartWindow(); + block_sync_lds(); + + static_for<0, BBlockBufferSize, 1>{}([&](auto ii) { + blockwise_gemm.Run(a_block_buf, b_thread_buf(Number{}), c_thread_buf); + blockwise_gemm.MoveABlockSliceWindow(); + s_nop(); + + b_threadwise_copy.Run(b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + b_grid_buf, + b_thread_desc_k0_k1_k2_n0_n1_n2_n3_k3, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + b_thread_buf(Number{})); + b_threadwise_copy.MoveSrcSliceWindow(b_grid_desc_k0_k1_k2_n0_n1_n2_n3_k3, + b_thread_slice_copy_step); + }); + + block_sync_lds(); + a_blockwise_copy.RunWrite(a_block_desc_k0_m_k1, a_block_buf); + // move a and b window + a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc_k0_m_k1, + a_block_slice_copy_step); + + i += 1; + } while(i < (K0BlockMainLoop - 1)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.ResetABlockStartWindow(); + + static_for<0, BBlockBufferSize, 1>{}([&](auto ii) { + blockwise_gemm.Run(a_block_buf, b_thread_buf(Number{}), c_thread_buf); + blockwise_gemm.MoveABlockSliceWindow(); + }); + } + } + + // output: register to global memory + { + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I0); + constexpr auto N0 = 
c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I7); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_grid = + m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; + + const index_t n_thread_data_on_grid = + n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_grid_idx = + m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_grid)); + + const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_grid_idx = + n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_grid)); + + auto c_thread_copy = + ThreadwiseTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{ + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(m_thread_data_on_grid_idx[I0], + n_thread_data_on_grid_idx[I0], + m_thread_data_on_grid_idx[I1], + n_thread_data_on_grid_idx[I1], + m_thread_data_on_grid_idx[I2], + m_thread_data_on_grid_idx[I3], + m_thread_data_on_grid_idx[I4], + n_thread_data_on_grid_idx[I2]), + c_element_op}; + + c_thread_copy.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_grid_buf); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp new file mode 100644 index 00000000..d1149c0c --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp @@ -0,0 +1,557 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
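// --- Editor's sketch (not part of the upstream patch) -------------------------
// The header that follows defines GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3:
// each workgroup owns one MPerBlock x NPerBlock tile of C (selected through
// Block2CTileMap), stages its A[K0PerBlock, MPerBlock, K1] and
// B[K0PerBlock, NPerBlock, K1] tiles in LDS, and accumulates with XDLOPS
// (MFMA) instructions in registers before the threadwise copy back to global
// memory. The standalone snippet below restates the host-side bookkeeping with
// made-up example tile sizes; it sketches the divisibility checks done by
// CheckValidity() and the one-workgroup-per-tile grid sizing, and is not the
// kernel's actual implementation.
namespace v2r3_editor_sketch {

constexpr int MPerBlock  = 256; // example value only
constexpr int NPerBlock  = 128; // example value only
constexpr int K0PerBlock = 4;   // example value only

// Mirrors the divisibility part of CheckValidity(): M, N and K0 must tile
// evenly, otherwise the kernel rejects the problem.
constexpr bool ProblemIsSupported(int M, int N, int K0)
{
    return (M % MPerBlock == 0) && (N % NPerBlock == 0) && (K0 % K0PerBlock == 0);
}

// One workgroup per C tile; split-K batching is handled by the v2r4 variants
// further below, not by this kernel.
constexpr int GridSize(int M, int N)
{
    return (M / MPerBlock) * (N / NPerBlock);
}

static_assert(ProblemIsSupported(1024, 1024, 32), "example problem tiles evenly");
static_assert(GridSize(1024, 1024) == 4 * 8, "4 M-tiles x 8 N-tiles");

} // namespace v2r3_editor_sketch
// ------------------------------------------------------------------------------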
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v2r3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return a_block_desc_k0_m_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + 
make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return b_block_desc_k0_n_k1; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = + math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size_aligned + b_block_space_size_aligned) * sizeof(FloatAB); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXDL * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXDL)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / (K0PerBlock * K1); + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N& c_grid_desc_m_n) + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + using BlockwiseGemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; + + return 
BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m_n); + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 = + decltype(MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(CGridDesc_M_N{})); + using DefaultBlock2CTileMap = decltype(MakeDefaultBlock2CTileMap(CGridDesc_M_N{}, 1, 1)); + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2& c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetElementSpaceSize()); + + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple(c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I0), + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I1)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_k0_m_k1), + decltype(a_block_desc_k0_m_k1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_k0_m_k1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_k0_m_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc_k0_n_k1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + 
BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_k0_n_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + auto blockwise_gemm = BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_Selector< + BlockSize, + FloatAB, + FloatAcc, + decltype(a_block_desc_k0_m_k1), + decltype(b_block_desc_k0_n_k1), + MPerXDL, + NPerXDL, + MXdlPerWave, + NXdlPerWave, + K1, + LoopSched>(); + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_k0_n_k1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + + // gridwise GEMM pipeline + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + + GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + a_block_desc_k0_m_k1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_k0_n_k1, + b_block_desc_k0_n_k1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // output: register to global memory + { + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2.GetLength(I7); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_grid = + m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; + + const index_t n_thread_data_on_grid = + n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const 
auto m_thread_data_on_grid_idx = + m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_grid)); + + const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_grid_idx = + n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_grid)); + + auto c_thread_copy = + ThreadwiseTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{ + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(m_thread_data_on_grid_idx[I0], + n_thread_data_on_grid_idx[I0], + m_thread_data_on_grid_idx[I1], + n_thread_data_on_grid_idx[I1], + m_thread_data_on_grid_idx[I2], + m_thread_data_on_grid_idx[I3], + m_thread_data_on_grid_idx[I4], + n_thread_data_on_grid_idx[I2]), + c_element_op}; + + c_thread_copy.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_grid_buf); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp new file mode 100644 index 00000000..949d5648 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v2r4(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const ABK0MK1GridDesc a_b_k0_m_k1_grid_desc, + const BBK0NK1GridDesc b_b_k0_n_k1_grid_desc, + const CM0N0M1N1M2M3M4N2GridDesc c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const CBlockClusterAdaptor c_block_cluster_adaptor) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + p_shared_block, + a_b_k0_m_k1_grid_desc, + b_b_k0_n_k1_grid_desc, + c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, + a_element_op, + b_element_op, + c_element_op, + c_block_cluster_adaptor); +#else + ignore = p_a_grid; + ignore = 
p_b_grid; + ignore = p_c_grid; + ignore = a_b_k0_m_k1_grid_desc; + ignore = b_b_k0_n_k1_grid_desc; + ignore = c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = c_block_cluster_adaptor; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size = + math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + return (a_block_space_size + b_block_space_size) * sizeof(FloatAB); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const ABK0MK1GridDesc& a_b_k0_m_k1_grid_desc, + const BBK0NK1GridDesc& b_b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! 
K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerXDL)) == 0, + "Invalid tuning param!"); + + const auto M = a_b_k0_m_k1_grid_desc.GetLength(I2); + const auto N = b_b_k0_n_k1_grid_desc.GetLength(I2); + const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); + const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0); + + if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) && + K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) && + K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) && + KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0) + { + const bool has_main_k0_block_loop = K0 > K0PerBlock; + + return has_main_k0_block_loop; + } + + __host__ __device__ static constexpr auto + MakeCM0N0M1N1M2M3M4N2GridDescriptor(const CMNGridDesc& c_m_n_grid_desc) + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + using BlockwiseGemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1; + + return BlockwiseGemm::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_m_n_grid_desc); + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor( + const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch) + { + return BlockToCTileMap_KSplit_M00_N0_M01Adapt( + c_m_n_grid_desc, 8, KBatch); + } + + using CM0N0M1N1M2M3M4N2GridDesc = decltype(MakeCM0N0M1N1M2M3M4N2GridDescriptor(CMNGridDesc{})); + using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)); + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + FloatAB* __restrict__ p_shared_block, + const ABK0MK1GridDesc& a_b_k0_m_k1_grid_desc, + const BBK0NK1GridDesc& b_b_k0_n_k1_grid_desc, + const CM0N0M1N1M2M3M4N2GridDesc& c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const CBlockClusterAdaptor& c_block_cluster_adaptor) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + 
p_c_grid, c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetElementSpaceSize()); + + const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); + + // divide block work by [M, N] + const auto block_work_idx = + c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!c_block_cluster_adaptor.ValidCTileIndex( + make_tuple(block_work_idx[I1], block_work_idx[I2]), + make_tuple(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetLength(I0), + c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc.GetLength(I1)))) + { + return; + } + + const index_t k_batch_id = block_work_idx[I0]; + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + constexpr auto a_b_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + K1, + I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + max_lds_align); + } + }(); + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + constexpr auto b_b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + K1, + I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + max_lds_align); + } + }(); + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_b_k0_m_k1_grid_desc), + decltype(a_b_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + ABlockTransferSrcVectorDim, + 3, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_b_k0_m_k1_grid_desc, + make_multi_index(k_batch_id, 0, m_block_data_idx_on_grid, 0), + a_element_op, + a_b_k0_m_k1_block_desc, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_b_k0_n_k1_grid_desc), + decltype(b_b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + BBlockTransferSrcVectorDim, + 3, + 
BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_b_k0_n_k1_grid_desc, + make_multi_index(k_batch_id, 0, n_block_data_idx_on_grid, 0), + b_element_op, + b_b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + auto blockwise_gemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block = p_shared_block; + FloatAB* p_b_block = p_shared_block + a_block_space_size; + + constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); + + auto a_block_buf = make_dynamic_buffer( + p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer( + p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); + + // preload data into LDS + { + a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf); + + a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf); + } + + // Initialize C + c_thread_buf.Clear(); + + // main body + if constexpr(HasMainKBlockLoop) + { + index_t k0_block_data_begin = 0; + + do + { + a_blockwise_copy.MoveSrcSliceWindow(a_b_k0_m_k1_grid_desc, a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_b_k0_n_k1_grid_desc, b_block_slice_copy_step); + + a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf); + + k0_block_data_begin += K0PerBlock; + } while(k0_block_data_begin < (K0 - K0PerBlock)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I0); + constexpr auto N0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I1); + constexpr auto M1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I2); + constexpr auto N1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I3); + constexpr auto M2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I4); + constexpr auto M3 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I5); + constexpr auto M4 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I6); + constexpr auto N2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I7); + + constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc = + make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number{}, I1, I1, Number{}, I1, Number{}, I1)); + + // calculate origin of 
thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_grid = + m_block_data_idx_on_grid + c_thread_mtx_on_block[I0]; + + const index_t n_thread_data_on_grid = + n_block_data_idx_on_grid + c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_grid_idx = + m_thread_data_on_grid_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_grid)); + + const auto n_thread_data_on_grid_to_n0_n1_n2_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_grid_idx = + n_thread_data_on_grid_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_grid)); + + auto c_thread_copy = + ThreadwiseTensorSliceTransfer_v1r3, + CThreadTransferSrcDstAccessOrder, + CThreadTransferSrcDstVectorDim, + CThreadTransferDstScalarPerVector, + CGlobalMemoryDataOperation, + 1, + true>{ + + c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, + make_multi_index(m_thread_data_on_grid_idx[I0], + n_thread_data_on_grid_idx[I0], + m_thread_data_on_grid_idx[I1], + n_thread_data_on_grid_idx[I1], + m_thread_data_on_grid_idx[I2], + m_thread_data_on_grid_idx[I3], + m_thread_data_on_grid_idx[I4], + n_thread_data_on_grid_idx[I2]), + c_element_op}; + + c_thread_copy.Run(c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc, + make_tuple(I0, I0, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_m0_n0_m1_n1_m2_m3_m4_n2_grid_desc, + c_grid_buf); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp new file mode 100644 index 00000000..190194f1 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp @@ -0,0 +1,721 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
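// --- Editor's sketch (not part of the upstream patch) -------------------------
// gridwise_gemm_xdlops_v2r4r2 below adds two things on top of the v2r4 kernel:
// split-K batching (the extra leading KBatch dimension on the A/B grid
// descriptors, dispatched through BlockToCTileMap_KSplit_M00_N0_M01Adapt) and a
// "C shuffle" epilogue that bounces the accumulator tile through LDS before the
// blockwise copy to global memory. Because the A/B staging buffers and the C
// shuffle slab are never live at the same time, GetSharedMemoryNumberOfByte()
// takes the maximum of the two footprints rather than their sum. The snippet
// below restates that sizing rule with made-up example parameters; the real
// kernel derives the same quantities from its template arguments.
namespace v2r4r2_editor_sketch {

constexpr int K0PerBlock = 4, MPerBlock = 256, NPerBlock = 128, K1 = 8; // example values
constexpr int CShuffleMRepeatPerShuffle = 1, MWave = 4, MPerXDL = 32;   // example values
constexpr int CShuffleNRepeatPerShuffle = 1, NWave = 2, NPerXDL = 32;   // example values

constexpr long LdsBytes(int sizeof_ab, int sizeof_c)
{
    // A/B staging tiles (ignoring the optional extra-M/extra-N LDS padding and
    // the max_lds_align rounding the real code applies).
    const long ab_elems = long(K0PerBlock) * MPerBlock * K1   // A tile
                        + long(K0PerBlock) * NPerBlock * K1;  // B tile
    // One C shuffle slab: 1 x (CShuffleMRepeatPerShuffle*MWave*MPerXDL) x 1 x
    // (CShuffleNRepeatPerShuffle*NWave*NPerXDL), matching the BlockSliceLengths
    // of the LDS-to-global copy below.
    const long c_elems = long(CShuffleMRepeatPerShuffle) * MWave * MPerXDL *
                         CShuffleNRepeatPerShuffle * NWave * NPerXDL;
    // Same max() as GetSharedMemoryNumberOfByte(): the two uses share the LDS.
    const long ab_bytes = ab_elems * sizeof_ab;
    const long c_bytes  = c_elems * sizeof_c;
    return ab_bytes > c_bytes ? ab_bytes : c_bytes;
}

// fp16 inputs, fp16 output: the A/B staging dominates for these example sizes.
static_assert(LdsBytes(/*sizeof_ab=*/2, /*sizeof_c=*/2) == 24576, "example only");

} // namespace v2r4r2_editor_sketch
// ------------------------------------------------------------------------------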
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_v1.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v2r4r2(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_B_K0_M_K1 a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1 b_b_k0_n_k1_grid_desc, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const CBlockClusterAdaptor c_block_cluster_adaptor) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + constexpr index_t shared_block_size = + GridwiseGemm::GetSharedMemoryNumberOfByte() / sizeof(FloatAB); + + __shared__ FloatAB p_shared_block[shared_block_size]; + + GridwiseGemm::template Run(p_a_grid, + p_b_grid, + p_c_grid, + static_cast(p_shared_block), + a_b_k0_m_k1_grid_desc, + b_b_k0_n_k1_grid_desc, + c_grid_desc_mblock_mperblock_nblock_nperblock, + a_element_op, + b_element_op, + c_element_op, + c_block_cluster_adaptor); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_b_k0_m_k1_grid_desc; + ignore = b_b_k0_n_k1_grid_desc; + ignore = c_grid_desc_mblock_mperblock_nblock_nperblock; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = c_block_cluster_adaptor; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template +struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4r2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + 
make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size = + math::integer_least_multiple(b_k0_n_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + constexpr auto c_block_size = + GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock().GetElementSpaceSize(); + + return math::max((a_block_space_size + b_block_space_size) * sizeof(FloatAB), + c_block_size * sizeof(FloatC)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc, + const CMNGridDesc& c_m_n_grid_desc, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXDL * MRepeat) == 0) && + (NPerBlock % (NRepeat * NPerXDL)) == 0, + "Invalid tuning param!"); + + const auto M = a_b_k0_m_k1_grid_desc.GetLength(I2); + const auto N = b_b_k0_n_k1_grid_desc.GetLength(I2); + const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); + const auto KBatch = a_b_k0_m_k1_grid_desc.GetLength(I0); + + if(!(M == c_m_n_grid_desc.GetLength(I0) && N == c_m_n_grid_desc.GetLength(I1) && + K0 == b_b_k0_n_k1_grid_desc.GetLength(I1) && + K1 == a_b_k0_m_k1_grid_desc.GetLength(I3) && + K1 == b_b_k0_n_k1_grid_desc.GetLength(I3) && + KBatch == b_b_k0_n_k1_grid_desc.GetLength(I0))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + if(!block_2_ctile_map.CheckValidity(c_m_n_grid_desc)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainK0BlockLoop(index_t K0) + { + const bool has_main_k0_block_loop = K0 > K0PerBlock; + + return has_main_k0_block_loop; + } + + __host__ __device__ static constexpr auto + MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(const CMNGridDesc& c_m_n_grid_desc) + { + const auto M = c_m_n_grid_desc.GetLength(I0); + const auto N = c_m_n_grid_desc.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + return transform_tensor_descriptor( + c_m_n_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(MBlock, Number{})), + make_unmerge_transform(make_tuple(NBlock, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1>{}, Sequence<2, 3>{})); + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeCBlockClusterAdaptor( + const CMNGridDesc& c_m_n_grid_desc, index_t /* M01 */, index_t /* N01 */, index_t KBatch) + { + return BlockToCTileMap_KSplit_M00_N0_M01Adapt( + c_m_n_grid_desc, 8, KBatch); + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock() + { + constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL); + + return make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + I1, + Number{})); + } + + using 
CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = + decltype(MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(CMNGridDesc{})); + using CBlockClusterAdaptor = decltype(MakeCBlockClusterAdaptor(CMNGridDesc{}, 1, 1, 1)); + + template + __device__ static void Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared_block, + const AGridDesc_B_K0_M_K1& a_b_k0_m_k1_grid_desc, + const BGridDesc_B_K0_N_K1& b_b_k0_n_k1_grid_desc, + const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock& + c_grid_desc_mblock_mperblock_nblock_nperblock, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const CBlockClusterAdaptor& c_block_cluster_adaptor) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_b_k0_m_k1_grid_desc.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_b_k0_n_k1_grid_desc.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + const auto K0 = a_b_k0_m_k1_grid_desc.GetLength(I1); + + // divide block work by [M, N] + const auto block_work_idx = + c_block_cluster_adaptor.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!c_block_cluster_adaptor.ValidCTileIndex( + make_tuple(block_work_idx[I1], block_work_idx[I2]), + make_tuple(c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I0), + c_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(I2)))) + { + return; + } + + const index_t k_batch_id = block_work_idx[I0]; + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + constexpr auto a_b_k0_m_k1_block_desc = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + K1, + I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + max_lds_align); + } + }(); + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + constexpr auto b_b_k0_n_k1_block_desc = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + make_tuple(Number{} * Number{} * K1, + Number{} * K1, + K1, + I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number<1>{}, Number{}, Number{}, K1), + max_lds_align); + } + }(); + // A matrix 
blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_b_k0_m_k1_grid_desc), + decltype(a_b_k0_m_k1_block_desc), + ABlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + ABlockTransferSrcVectorDim, + 3, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_b_k0_m_k1_grid_desc, + make_multi_index(k_batch_id, 0, m_block_data_idx_on_grid, 0), + a_element_op, + a_b_k0_m_k1_block_desc, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_b_k0_n_k1_grid_desc), + decltype(b_b_k0_n_k1_block_desc), + BBlockTransferSrcAccessOrder, + Sequence<0, 2, 1, 3>, + BBlockTransferSrcVectorDim, + 3, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_b_k0_n_k1_grid_desc, + make_multi_index(k_batch_id, 0, n_block_data_idx_on_grid, 0), + b_element_op, + b_b_k0_n_k1_block_desc, + make_multi_index(0, 0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + auto blockwise_gemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size = + math::integer_least_multiple(a_k0_m_k1_block_desc.GetElementSpaceSize(), max_lds_align); + + FloatAB* p_a_block = static_cast(p_shared_block); + FloatAB* p_b_block = static_cast(p_shared_block) + a_block_space_size; + + constexpr auto a_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(0, K0PerBlock, 0, 0); + + auto a_block_buf = make_dynamic_buffer( + p_a_block, a_k0_m_k1_block_desc.GetElementSpaceSize()); + auto b_block_buf = make_dynamic_buffer( + p_b_block, b_k0_n_k1_block_desc.GetElementSpaceSize()); + + // preload data into LDS + { + a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf); + b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf); + + a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf); + } + + // Initialize C + c_thread_buf.Clear(); + + // main body + if constexpr(HasMainKBlockLoop) + { + index_t k0_block_data_begin = 0; + + do + { + a_blockwise_copy.MoveSrcSliceWindow(a_b_k0_m_k1_grid_desc, a_block_slice_copy_step); + b_blockwise_copy.MoveSrcSliceWindow(b_b_k0_n_k1_grid_desc, b_block_slice_copy_step); + + a_blockwise_copy.RunRead(a_b_k0_m_k1_grid_desc, a_grid_buf); + + block_sync_lds(); + + b_blockwise_copy.RunRead(b_b_k0_n_k1_grid_desc, b_grid_buf); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + + block_sync_lds(); + + a_blockwise_copy.RunWrite(a_b_k0_m_k1_block_desc, a_block_buf); + b_blockwise_copy.RunWrite(b_b_k0_n_k1_block_desc, b_block_buf); + + k0_block_data_begin += K0PerBlock; + } 
while(k0_block_data_begin < (K0 - K0PerBlock)); + } + + // tail + { + block_sync_lds(); + + blockwise_gemm.Run(a_block_buf, b_block_buf, c_thread_buf); + } + + // output: register to global memory + { + constexpr index_t MWave = MPerBlock / (MRepeat * MPerXDL); + constexpr index_t NWave = NPerBlock / (NRepeat * NPerXDL); + + constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I0); + constexpr auto N0 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I1); + constexpr auto M1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I2); + constexpr auto N1 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I3); + constexpr auto M2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I4); + constexpr auto M3 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I5); + constexpr auto M4 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I6); + constexpr auto N2 = c_m0_n0_m1_n1_m2_m3_m4_n2_block_desc.GetLength(I7); + + constexpr auto c_block_desc_mblock_mperblock_nblock_nperblock = + GetCBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(); + + auto c_block_buf = make_dynamic_buffer( + static_cast(p_shared_block), + c_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_block_desc_mblock_mperblock_nblock_nperblock, + make_tuple( + make_freeze_transform(I0), // freeze mblock + make_unmerge_transform(make_tuple(CShuffleMRepeatPerShuffle, + M1, + M2, + M3, + M4)), // M1 = MWave, M2 * M3 * M4 = MPerXDL + make_freeze_transform(I0), // freeze nblock + make_unmerge_transform(make_tuple(CShuffleNRepeatPerShuffle, + N1, + N2))), // M1 = MWave, M2 * M3 * M4 = MPerXDL + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple( + Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + 
m_thread_data_on_block_idx[I2], + m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // LDS to global + auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMRepeatPerShuffle * MWave * MPerXDL, + 1, + CShuffleNRepeatPerShuffle * NWave * NPerXDL>, // BlockSliceLengths, + CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, + Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder, + FloatC, // typename SrcData, + FloatC, // typename DstData, + decltype(c_block_desc_mblock_mperblock_nblock_nperblock), + decltype(c_grid_desc_mblock_mperblock_nblock_nperblock), + Sequence<0, 1, 2, 3>, // typename DimAccessOrder, + 3, // index_t VectorDim, + CBlockTransferScalarPerVector_NWaveNPerXDL, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun + {c_block_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(0, 0, 0, 0), + c_grid_desc_mblock_mperblock_nblock_nperblock, + make_multi_index(block_work_idx[I1], 0, block_work_idx[I2], 0), + c_element_op}; + + constexpr auto mxdlperwave_forward_step = + make_multi_index(0, CShuffleMRepeatPerShuffle * MWave * MPerXDL, 0, 0); + constexpr auto nxdlperwave_forward_step = + make_multi_index(0, 0, 0, CShuffleNRepeatPerShuffle * NWave * NPerXDL); + constexpr auto nxdlperwave_backward_step = + make_multi_index(0, 0, 0, -CShuffleNRepeatPerShuffle * NWave * NPerXDL); + + static_for<0, MRepeat, CShuffleMRepeatPerShuffle>{}([&](auto mxdlperwave_iter) { + constexpr auto mxdlperwave = mxdlperwave_iter; + + static_for<0, NRepeat, CShuffleNRepeatPerShuffle>{}([&](auto nxdlperwave_iter) { + constexpr bool nxdlperwave_forward_sweep = + (mxdlperwave % (2 * CShuffleMRepeatPerShuffle) == 0); + + constexpr index_t nxdlperwave_value = + nxdlperwave_forward_sweep + ? 
nxdlperwave_iter + : (NRepeat - nxdlperwave_iter - CShuffleNRepeatPerShuffle); + + constexpr auto nxdlperwave = Number{}; + + // make sure it's safe to do ds_write + block_sync_lds(); + + // VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_m0_n0_m1_n1_m2_m3_m4_n2_thread_desc, + make_tuple(mxdlperwave, nxdlperwave, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_block_buf); + + // make sure it's safe to do ds_read + block_sync_lds(); + + // LDS to global + c_block_copy_lds_to_global.Run(c_block_desc_mblock_mperblock_nblock_nperblock, + c_block_buf, + c_grid_desc_mblock_mperblock_nblock_nperblock, + c_grid_buf); + + // move on nxdlperwave dimension + if constexpr(nxdlperwave_forward_sweep && + (nxdlperwave < NRepeat - CShuffleNRepeatPerShuffle)) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + nxdlperwave_forward_step); + } + else if constexpr((!nxdlperwave_forward_sweep) && (nxdlperwave > 0)) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, + nxdlperwave_backward_step); + } + }); + + // move on mxdlperwave dimension + if constexpr(mxdlperwave < MRepeat - CShuffleMRepeatPerShuffle) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mperblock_nblock_nperblock, mxdlperwave_forward_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp new file mode 100644 index 00000000..ffb2926c --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r1.hpp @@ -0,0 +1,723 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
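// --- Editor's sketch (not part of the upstream patch) -------------------------
// gridwise_gemm_xdlops_v3r1 below carries separate AK1Value/BK1Value packing
// factors for A and B and, like v2r4r2 above, finishes with a C-shuffle
// epilogue. The per-block tile is split across waves as in the earlier
// variants: MPerBlock = MXdlPerWave * MWave * MPerXdl and
// NPerBlock = NXdlPerWave * NWave * NPerXdl. The helper below just makes that
// decomposition explicit with example numbers; treating the workgroup as
// MWave * NWave waves of 64 lanes is the editor's assumption about how
// BlockSize is typically chosen for these gfx908/gfx90a kernels, not something
// stated in this header.
namespace v3r1_editor_sketch {

constexpr int WaveSize = 64; // assumption: wavefront size on gfx908/gfx90a

struct BlockTile
{
    int MPerBlock, NPerBlock;
    int MPerXdl, NPerXdl;
    int MXdlPerWave, NXdlPerWave;

    constexpr int MWave() const { return MPerBlock / (MXdlPerWave * MPerXdl); }
    constexpr int NWave() const { return NPerBlock / (NXdlPerWave * NPerXdl); }
    constexpr int ExpectedBlockSize() const { return MWave() * NWave() * WaveSize; }

    // Same divisibility rule that CheckValidity() enforces in the v2r3/v2r4
    // kernels earlier in this patch.
    constexpr bool IsValid() const
    {
        return MPerBlock % (MXdlPerWave * MPerXdl) == 0 &&
               NPerBlock % (NXdlPerWave * NPerXdl) == 0;
    }
};

// Example: a 256 x 128 block tile on 32x32 XDL ops, with 2 XDL tiles per wave
// in M and N, gives a 4 x 2 wave grid, i.e. a 512-thread workgroup.
constexpr BlockTile example{256, 128, 32, 32, 2, 2};
static_assert(example.IsValid(), "tile decomposes evenly");
static_assert(example.MWave() == 4 && example.NWave() == 2, "wave grid");
static_assert(example.ExpectedBlockSize() == 512, "editor's assumption");

} // namespace v3r1_editor_sketch
// ------------------------------------------------------------------------------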
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v3r1( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_c_grid, + p_shared, + a_grid_desc_ak0_m_ak1, + b_grid_desc_bk0_n_bk1, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = a_grid_desc_ak0_m_ak1; + ignore = b_grid_desc_bk0_n_bk1; + ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template < + index_t BlockSize, + typename FloatAB, + typename FloatAcc, + typename FloatCShuffle, + typename FloatC, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, + typename AGridDesc_AK0_M_AK1, + typename BGridDesc_BK0_N_BK1, + typename CGridDesc_M_N, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + index_t MPerBlock, + index_t NPerBlock, + index_t KPerBlock, + index_t AK1Value, + index_t BK1Value, + index_t MPerXdl, + index_t NPerXdl, + index_t MXdlPerWave, + index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_AK0_M_AK1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + index_t ABlockTransferSrcVectorDim, + index_t ABlockTransferSrcScalarPerVector, + index_t ABlockTransferDstScalarPerVector_K1, + bool AThreadTransferSrcResetCoordinateAfterRun, + bool ABlockLdsExtraM, + typename BBlockTransferThreadClusterLengths_BK0_N_BK1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + index_t BBlockTransferSrcVectorDim, + index_t BBlockTransferSrcScalarPerVector, + 
index_t BBlockTransferDstScalarPerVector_K1, + bool BThreadTransferSrcResetCoordinateAfterRun, + bool BBlockLdsExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + index_t CBlockTransferScalarPerVector_NWaveNPerXdl, + index_t NumGemmKPrefetchStage = 1, + PipelineVersion PipelineVer = PipelineVersion::v1> +struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto AK0 = Number{}; + static constexpr auto BK0 = Number{}; + static constexpr auto AK1 = Number{}; + static constexpr auto BK1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1() + { + constexpr auto max_lds_align = AK1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(AK0, Number{}, AK1), + make_tuple(Number{} * AK1, AK1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(AK0, Number{}, AK1), max_lds_align); + } + }(); + + return a_block_desc_ak0_m_ak1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1() + { + constexpr auto max_lds_align = BK1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(BK0, Number{}, BK1), + make_tuple(Number{} * BK1, BK1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(BK0, Number{}, BK1), max_lds_align); + } + }(); + + return b_block_desc_bk0_n_bk1; + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + return c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_ak0_m_ak1.GetElementSpaceSize(), AK1); + + constexpr auto b_block_space_size_aligned = + math::integer_least_multiple(b_block_desc_bk0_n_bk1.GetElementSpaceSize(), BK1); + + // LDS allocation for C shuffle in LDS + constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + 
GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); + + constexpr auto c_block_size = + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatCShuffle)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_AK0_M_AK1& a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + // static_assert(is_known_at_compile_time>::value && + // is_known_at_compile_time>::value, + // "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_ak0_m_ak1.GetLength(I1); + const auto N = b_grid_desc_bk0_n_bk1.GetLength(I1); + const auto K = a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K / KPerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / KPerBlock; + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + const CGridDesc_M_N& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + const auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple( + MBlock, Number{}, Number{})), + make_unmerge_transform(make_tuple( + NBlock, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = + remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + void* __restrict__ p_shared, + const AGridDesc_AK0_M_AK1& 
a_grid_desc_ak0_m_ak1, + const BGridDesc_BK0_N_BK1& b_grid_desc_bk0_n_bk1, + const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl& + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetLength(I0), + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetLength(I3)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = math::lcm(AK1, BK1); + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_AK0_M_AK1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_ak0_m_ak1), + decltype(a_block_desc_ak0_m_ak1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_ak0_m_ak1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_ak0_m_ak1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_BK0_N_BK1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_bk0_n_bk1), + decltype(b_block_desc_bk0_n_bk1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_bk0_n_bk1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_bk0_n_bk1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, 
and saved in + // register + // sanity check + constexpr index_t k_pack = math::max( + math::lcm(AK1, BK1), MfmaSelector::selected_mfma.k_per_blk); + + auto blockwise_gemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = math::integer_least_multiple( + a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_bk0_n_bk1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1, 0, 0); + + // gridwise GEMM pipeline + const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane( + (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) / + KPerBlock); + + GridwiseGemmPipe::template Run(a_grid_desc_ak0_m_ak1, + a_block_desc_ak0_m_ak1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_bk0_n_bk1, + b_block_desc_bk0_n_bk1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + num_k_block_main_loop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
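+            // Note on the 8-d thread/block descriptors used below (added comment):
+            // M is decomposed as M0 * M1 * M2 * M3 * M4 with M0 = MXdlPerWave,
+            // M1 = MWave and M2 * M3 * M4 = MPerXdl (the per-lane layout of one
+            // xdlops output tile); N is decomposed as N0 * N1 * N2 with
+            // N0 = NXdlPerWave, N1 = NWave, N2 = NPerXdl.
+            //
+            // Illustrative numbers (one possible tuning point, not a requirement):
+            // MPerBlock = 256, NPerBlock = 128, MPerXdl = NPerXdl = 32,
+            // MXdlPerWave = 4, NXdlPerWave = 2 gives MWave = 256 / (4 * 32) = 2 and
+            // NWave = 128 / (2 * 32) = 2, i.e. 4 waves (256 threads with 64-lane
+            // wavefronts) cooperate on one 256 x 128 C tile, each wave owning a
+            // 4 x 2 grid of 32 x 32 xdlops tiles.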
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); + + auto c_shuffle_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_tuple( + make_freeze_transform(I0), // freeze mblock + make_pass_through_transform( + Number{}), // M0 (MXdlPerWave) per shuffle + make_unmerge_transform( + make_tuple(M1, M2, M3, M4)), // M1 = MWave, M2 * M3 * M4 = MPerXdl + make_freeze_transform(I0), // freeze nblock + make_pass_through_transform( + Number{}), // N0 (NXdlPerWave) per shuffle + make_unmerge_transform( + make_tuple(N1, N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<2, 4, 5, 6>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<3, 7>{})); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 
m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + // LDS to global + auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r1< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle, + MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle, + NWave * NPerXdl>, // BlockSliceLengths, + CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder, + FloatCShuffle, // typename SrcData, + FloatC, // typename DstData, + decltype( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + decltype( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder, + 5, // index_t VectorDim, + CBlockTransferScalarPerVector_NWaveNPerXdl, // index_t ScalarPerVector, + true, // bool ThreadTransferSrcResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(0, 0, 0, 0, 0, 0), + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_element_op}; + + constexpr auto mxdlperwave_forward_step = + make_multi_index(0, CShuffleMXdlPerWavePerShuffle, 0, 0, 0, 0); + constexpr auto nxdlperwave_forward_step = + make_multi_index(0, 0, 0, 0, CShuffleNXdlPerWavePerShuffle, 0); + constexpr auto nxdlperwave_backward_step = + make_multi_index(0, 0, 0, 0, -CShuffleNXdlPerWavePerShuffle, 0); + + static_for<0, MXdlPerWave, CShuffleMXdlPerWavePerShuffle>{}([&](auto mxdlperwave_iter) { + constexpr auto mxdlperwave = mxdlperwave_iter; + + static_for<0, + NXdlPerWave, + CShuffleNXdlPerWavePerShuffle>{}([&](auto nxdlperwave_iter) { + constexpr bool nxdlperwave_forward_sweep = + (mxdlperwave % (2 * CShuffleMXdlPerWavePerShuffle) == 0); + + constexpr index_t nxdlperwave_value = + nxdlperwave_forward_sweep + ? 
nxdlperwave_iter + : (NXdlPerWave - nxdlperwave_iter - CShuffleNXdlPerWavePerShuffle); + + constexpr auto nxdlperwave = Number{}; + + // make sure it's safe to do ds_write + block_sync_lds(); + + // VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_tuple(mxdlperwave, nxdlperwave, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_shuffle_block_buf); + + // make sure it's safe to do ds_read + block_sync_lds(); + + // LDS to global + c_block_copy_lds_to_global.Run( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c_shuffle_block_buf, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c_grid_buf); + + // move on nxdlperwave dimension + if constexpr(nxdlperwave_forward_sweep && + (nxdlperwave < NXdlPerWave - CShuffleNXdlPerWavePerShuffle)) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_forward_step); + } + else if constexpr((!nxdlperwave_forward_sweep) && (nxdlperwave > 0)) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_backward_step); + } + }); + + // move on mxdlperwave dimension + if constexpr(mxdlperwave < MXdlPerWave - CShuffleMXdlPerWavePerShuffle) + { + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + mxdlperwave_forward_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp new file mode 100644 index 00000000..7e6dbb3b --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r2.hpp @@ -0,0 +1,762 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
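+
+// Overview (doc comment added for readability): v3r2 is the same gridwise GEMM
+// as v3r1 except that the C write-out additionally reads an auxiliary tensor C0
+// from global memory (ThreadGroupTensorSliceTransfer_v6r2 has two sources: the
+// C shuffle tile in LDS and the C0 grid window) and combines them with
+// CElementwiseOperation, e.g. a bias or residual add fused into the store.
+//
+// Illustrative two-source elementwise op, following the usual CK convention of
+// an output reference followed by the inputs (a sketch only -- the actual
+// operation is whatever the caller supplies as CElementwiseOperation):
+//
+//   struct AddC0
+//   {
+//       template <typename Y, typename X0, typename X1>
+//       __host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const
+//       {
+//           y = x0 + x1; // y <- c + c0
+//       }
+//   };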
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r2.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v3r2( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC* __restrict__ p_c0_grid, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_c_grid, + p_c0_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_c0_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template < + index_t BlockSize, + typename FloatAB, + typename FloatAcc, + typename FloatC, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, + typename AGridDesc_K0_M_K1, + typename BGridDesc_K0_N_K1, + typename CGridDesc_M_N, + typename C0GridDesc_M_N, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + index_t MPerBlock, + index_t NPerBlock, + index_t K0PerBlock, + index_t MPerXdl, + index_t NPerXdl, + index_t K1Value, + index_t MXdlPerWave, + index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + index_t ABlockTransferSrcVectorDim, + index_t ABlockTransferSrcScalarPerVector, + index_t ABlockTransferDstScalarPerVector_K1, 
+ bool AThreadTransferSrcResetCoordinateAfterRun, + bool ABlockLdsExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + index_t BBlockTransferSrcVectorDim, + index_t BBlockTransferSrcScalarPerVector, + index_t BBlockTransferDstScalarPerVector_K1, + bool BThreadTransferSrcResetCoordinateAfterRun, + bool BBlockLdsExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + index_t CBlockTransferScalarPerVector_NWaveNPerXdl, + index_t NumGemmKPrefetchStage = 1, + PipelineVersion PipelineVer = PipelineVersion::v1> +struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r2 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return a_block_desc_k0_m_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return b_block_desc_k0_n_k1; + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + return c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = + 
math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); + + constexpr auto c_block_size = + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatC)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / (K0PerBlock * K1); + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + const CGridDesc_M_N_& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + const auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple( + MBlock, Number{}, Number{})), + make_unmerge_transform(make_tuple( + NBlock, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + + using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = + 
remove_cvref_t; + + using C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = + remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC* __restrict__ p_c0_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl& + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl& + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + auto c0_grid_buf = make_dynamic_buffer( + p_c0_grid, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetLength(I0), + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetLength(I3)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_k0_m_k1), + decltype(a_block_desc_k0_m_k1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + a_grid_desc_k0_m_k1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_k0_m_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + 
BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc_k0_n_k1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true, + NumGemmKPrefetchStage>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_k0_n_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + auto blockwise_gemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_k0_n_k1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + + // gridwise GEMM pipeline + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + + GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + a_block_desc_k0_m_k1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_k0_n_k1, + b_block_desc_k0_n_k1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + K0BlockMainLoop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
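+            // Write-out order (added comment): the loops below sweep the CShuffle
+            // tiles in a zig-zag (boustrophedon) order -- the n window moves
+            // forward when mxdlperwave is a multiple of 2 * CShuffleMXdlPerWavePerShuffle
+            // and backward otherwise -- so both the C0 source window and the C
+            // destination window only ever take one small MoveSrc1SliceWindow /
+            // MoveDstSliceWindow step per tile and never need to be reset.
+            //
+            // Illustrative trace (assuming MXdlPerWave = NXdlPerWave = 4 and
+            // CShuffleMXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle = 2):
+            // the (mxdlperwave, nxdlperwave) tiles are visited as
+            // (0,0) -> (0,2) -> (2,2) -> (2,0).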
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); + + auto c_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_tuple( + make_freeze_transform(I0), // freeze mblock + make_pass_through_transform( + Number{}), // M0 (MXdlPerWave) per shuffle + make_unmerge_transform( + make_tuple(M1, M2, M3, M4)), // M1 = MWave, M2 * M3 * M4 = MPerXdl + make_freeze_transform(I0), // freeze nblock + make_pass_through_transform( + Number{}), // N0 (NXdlPerWave) per shuffle + make_unmerge_transform( + make_tuple(N1, N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<2, 4, 5, 6>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<3, 7>{}) + + ); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 
m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r2< + ThisThreadBlock, // index_t BlockSize, + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle, + MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle, + NWave * NPerXdl>, // BlockSliceLengths, + CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder, + FloatC, // typename Src0Data, + FloatC, // typename Src1Data, + FloatC, // typename DstData, + decltype( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + decltype( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + decltype( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder, + 5, // index_t VectorDim, + CBlockTransferScalarPerVector_NWaveNPerXdl, // index_t ScalarPerVector, + true, // bool ThreadTransferSrc0ResetCoordinateAfterRun, + false, // bool ThreadTransferSrc1ResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(0, 0, 0, 0, 0, 0), + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_element_op}; + + constexpr auto mxdlperwave_forward_step = + make_multi_index(0, CShuffleMXdlPerWavePerShuffle, 0, 0, 0, 0); + constexpr auto nxdlperwave_forward_step = + make_multi_index(0, 0, 0, 0, CShuffleNXdlPerWavePerShuffle, 0); + constexpr auto nxdlperwave_backward_step = + make_multi_index(0, 0, 0, 0, -CShuffleNXdlPerWavePerShuffle, 0); + + static_for<0, MXdlPerWave, CShuffleMXdlPerWavePerShuffle>{}([&](auto mxdlperwave_iter) { + constexpr auto mxdlperwave = mxdlperwave_iter; + + static_for<0, + NXdlPerWave, + CShuffleNXdlPerWavePerShuffle>{}([&](auto nxdlperwave_iter) { + constexpr bool nxdlperwave_forward_sweep = + (mxdlperwave % (2 * CShuffleMXdlPerWavePerShuffle) == 0); + + constexpr index_t nxdlperwave_value = + nxdlperwave_forward_sweep + ? 
nxdlperwave_iter + : (NXdlPerWave - nxdlperwave_iter - CShuffleNXdlPerWavePerShuffle); + + constexpr auto nxdlperwave = Number{}; + + // make sure it's safe to do ds_write + block_sync_lds(); + + // VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_tuple(mxdlperwave, nxdlperwave, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_block_buf); + + // make sure it's safe to do ds_read + block_sync_lds(); + + // LDS to global + c_block_copy_lds_to_global.Run( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c_block_buf, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c0_grid_buf, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c_grid_buf); + + // move on nxdlperwave dimension + if constexpr(nxdlperwave_forward_sweep && + (nxdlperwave < NXdlPerWave - CShuffleNXdlPerWavePerShuffle)) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_forward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_forward_step); + } + else if constexpr((!nxdlperwave_forward_sweep) && (nxdlperwave > 0)) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_backward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_backward_step); + } + }); + + // move on mxdlperwave dimension + if constexpr(mxdlperwave < MXdlPerWave - CShuffleMXdlPerWavePerShuffle) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + mxdlperwave_forward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + mxdlperwave_forward_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp new file mode 100644 index 00000000..fb1e34b9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v3r3.hpp @@ -0,0 +1,801 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
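+
+// Overview (doc comment added for readability): v3r3 extends v3r2 with a second
+// auxiliary global input, i.e. the epilogue reads both a C0 and a C1 window
+// (ThreadGroupTensorSliceTransfer_v6r3) and combines them with the C shuffle
+// tile through CElementwiseOperation before storing to C -- for example a GEMM
+// fused with two per-output additions. Apart from the extra source pointer,
+// descriptor and slice-window moves, the structure mirrors v3r2.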
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/multi_index_transform_helper.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v6r3.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_gemm_xdlops_v3r3( + const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC* __restrict__ p_c0_grid, + const FloatC* __restrict__ p_c1_grid, + const AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CElementwiseOperation c_element_op, + const Block2CTileMap block_2_ctile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__)) + __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()]; + + GridwiseGemm::template Run( + p_a_grid, + p_b_grid, + p_c_grid, + p_c0_grid, + p_c1_grid, + p_shared, + a_grid_desc_k0_m_k1, + b_grid_desc_k0_n_k1, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + a_element_op, + b_element_op, + c_element_op, + block_2_ctile_map); +#else + ignore = p_a_grid; + ignore = p_b_grid; + ignore = p_c_grid; + ignore = p_c0_grid; + ignore = p_c1_grid; + ignore = a_grid_desc_k0_m_k1; + ignore = b_grid_desc_k0_n_k1; + ignore = c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + ignore = a_element_op; + ignore = b_element_op; + ignore = c_element_op; + ignore = block_2_ctile_map; +#endif // end of if (defined(__gfx908__) || defined(__gfx90a__)) +} + +template < + index_t BlockSize, + typename FloatAB, + typename FloatAcc, + typename FloatC, + InMemoryDataOperationEnum CGlobalMemoryDataOperation, + typename AGridDesc_K0_M_K1, + typename BGridDesc_K0_N_K1, + typename CGridDesc_M_N, + typename C0GridDesc_M_N, + typename C1GridDesc_M_N, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + 
index_t MPerBlock, + index_t NPerBlock, + index_t K0PerBlock, + index_t MPerXdl, + index_t NPerXdl, + index_t K1Value, + index_t MXdlPerWave, + index_t NXdlPerWave, + typename ABlockTransferThreadClusterLengths_K0_M_K1, + typename ABlockTransferThreadClusterArrangeOrder, + typename ABlockTransferSrcAccessOrder, + index_t ABlockTransferSrcVectorDim, + index_t ABlockTransferSrcScalarPerVector, + index_t ABlockTransferDstScalarPerVector_K1, + bool AThreadTransferSrcResetCoordinateAfterRun, + bool ABlockLdsExtraM, + typename BBlockTransferThreadClusterLengths_K0_N_K1, + typename BBlockTransferThreadClusterArrangeOrder, + typename BBlockTransferSrcAccessOrder, + index_t BBlockTransferSrcVectorDim, + index_t BBlockTransferSrcScalarPerVector, + index_t BBlockTransferDstScalarPerVector_K1, + bool BThreadTransferSrcResetCoordinateAfterRun, + bool BBlockLdsExtraN, + index_t CShuffleMXdlPerWavePerShuffle, + index_t CShuffleNXdlPerWavePerShuffle, + typename CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + index_t CBlockTransferScalarPerVector_NWaveNPerXdl, + index_t NumGemmKPrefetchStage = 1, + PipelineVersion PipelineVer = PipelineVersion::v1> +struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + static constexpr auto I6 = Number<6>{}; + static constexpr auto I7 = Number<7>{}; + + // K1 should be Number<...> + static constexpr auto K1 = Number{}; + + using ThisThreadBlock = ThisThreadBlock; + + using GridwiseGemmPipe = remove_cvref_t())>; + + __host__ __device__ static constexpr auto GetABlockDescriptor_K0PerBlock_MPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = [&]() { + if constexpr(ABlockLdsExtraM) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return a_block_desc_k0_m_k1; + } + + __host__ __device__ static constexpr auto GetBBlockDescriptor_K0PerBlock_NPerBlock_K1() + { + constexpr auto max_lds_align = K1; + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = [&]() { + if constexpr(BBlockLdsExtraN) + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, K1), + make_tuple(Number{} * K1, K1, I1)); + } + else + { + return make_naive_tensor_descriptor_aligned( + make_tuple(Number{}, Number{}, K1), max_lds_align); + } + }(); + + return b_block_desc_k0_n_k1; + } + + __host__ __device__ static constexpr auto + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl() + { + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + constexpr auto + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + make_naive_tensor_descriptor_packed( + make_tuple(I1, + Number{}, + Number{}, + I1, + Number{}, + Number{})); + + return c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + // LDS allocation for A and B: be careful of 
alignment + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + constexpr auto b_block_desc_k0_n_k1 = GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + constexpr auto max_lds_align = K1; + + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + constexpr auto b_block_space_size_aligned = + math::integer_least_multiple(b_block_desc_k0_n_k1.GetElementSpaceSize(), max_lds_align); + + // LDS allocation for C shuffle in LDS + constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); + + constexpr auto c_block_size = + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize(); + + return math::max((a_block_space_size_aligned + b_block_space_size_aligned) * + sizeof(FloatAB), + c_block_size * sizeof(FloatC)); + } + + // block_id to matrix tile idx (m0, n0) mapping are controlled by {M01, N01} + template + __host__ __device__ static constexpr bool + CheckValidity(const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDesc_M_N& c_grid_desc_m_n, + const Block2CTileMap& block_2_ctile_map) + { + static_assert(is_known_at_compile_time>::value, + "wrong! K1 need to be known at compile-time"); + + static_assert((MPerBlock % (MPerXdl * MXdlPerWave) == 0) && + (NPerBlock % (NXdlPerWave * NPerXdl)) == 0, + "Invalid tuning param!"); + + const auto M = a_grid_desc_k0_m_k1.GetLength(I1); + const auto N = b_grid_desc_k0_n_k1.GetLength(I1); + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + if(!(M == c_grid_desc_m_n.GetLength(I0) && N == c_grid_desc_m_n.GetLength(I1) && + K0 == b_grid_desc_k0_n_k1.GetLength(I0) && K1 == a_grid_desc_k0_m_k1.GetLength(I2) && + K1 == b_grid_desc_k0_n_k1.GetLength(I2))) + return false; + + if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K0 % K0PerBlock == 0)) + return false; + + // check gridwise gemm pipeline + const auto num_k_loop = K0 / K0PerBlock; + + if(!GridwiseGemmPipe::IsSupported(num_k_loop)) + { + return false; + } + + if(!block_2_ctile_map.CheckValidity(c_grid_desc_m_n)) + { + return false; + } + + // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc) + return true; + } + + __host__ __device__ static constexpr bool CalculateHasMainKBlockLoop(index_t K) + { + const index_t num_loop = K / (K0PerBlock * K1); + + return GridwiseGemmPipe::CalculateHasMainLoop(num_loop); + } + + template + __host__ __device__ static constexpr auto + MakeCGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl( + const CGridDesc_M_N_& c_grid_desc_m_n) + { + const auto M = c_grid_desc_m_n.GetLength(I0); + const auto N = c_grid_desc_m_n.GetLength(I1); + + const auto MBlock = M / MPerBlock; + const auto NBlock = N / NPerBlock; + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + const auto c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + transform_tensor_descriptor( + c_grid_desc_m_n, + make_tuple(make_unmerge_transform(make_tuple( + MBlock, Number{}, Number{})), + make_unmerge_transform(make_tuple( + NBlock, Number{}, Number{}))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3, 4, 5>{})); + + return 
c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl; + } + + // return block_id to C matrix tile idx (m0, n0) mapping + __host__ __device__ static constexpr auto MakeDefaultBlock2CTileMap( + const CGridDesc_M_N& c_grid_desc_m_n, index_t /* M01 */, index_t /* N01 */) + { + return BlockToCTileMap_M00_N0_M01Adapt( + c_grid_desc_m_n); + } + using CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = + remove_cvref_t; + + using C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = + remove_cvref_t; + + using C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl = + remove_cvref_t; + + using DefaultBlock2CTileMap = + remove_cvref_t; + + template + __device__ static void + Run(const FloatAB* __restrict__ p_a_grid, + const FloatAB* __restrict__ p_b_grid, + FloatC* __restrict__ p_c_grid, + const FloatC* __restrict__ p_c0_grid, + const FloatC* __restrict__ p_c1_grid, + void* __restrict__ p_shared, + const AGridDesc_K0_M_K1& a_grid_desc_k0_m_k1, + const BGridDesc_K0_N_K1& b_grid_desc_k0_n_k1, + const CGridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl& + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const C0GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl& + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const C1GridDescriptor_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl& + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op, + const Block2CTileMap& block_2_ctile_map) + { + const auto a_grid_buf = make_dynamic_buffer( + p_a_grid, a_grid_desc_k0_m_k1.GetElementSpaceSize()); + const auto b_grid_buf = make_dynamic_buffer( + p_b_grid, b_grid_desc_k0_n_k1.GetElementSpaceSize()); + auto c_grid_buf = make_dynamic_buffer( + p_c_grid, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + auto c0_grid_buf = make_dynamic_buffer( + p_c0_grid, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + auto c1_grid_buf = make_dynamic_buffer( + p_c1_grid, + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + + const auto K0 = a_grid_desc_k0_m_k1.GetLength(I0); + + // divide block work by [M, N] + const auto block_work_idx = + block_2_ctile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + if(!block_2_ctile_map.ValidCTileIndex( + block_work_idx, + make_tuple( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetLength(I0), + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetLength(I3)))) + { + return; + } + + // HACK: this force m/n_block_data_idx_on_grid into SGPR + const index_t m_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * MPerBlock); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * NPerBlock); + + // lds max alignment + constexpr auto max_lds_align = K1; + + // A matrix in LDS memory, dst of blockwise copy + constexpr auto a_block_desc_k0_m_k1 = GetABlockDescriptor_K0PerBlock_MPerBlock_K1(); + + // B matrix in LDS memory, dst of blockwise copy + constexpr auto b_block_desc_k0_n_k1 = 
GetBBlockDescriptor_K0PerBlock_NPerBlock_K1(); + + // A matrix blockwise copy + auto a_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + ABlockTransferThreadClusterLengths_K0_M_K1, + ABlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(a_grid_desc_k0_m_k1), + decltype(a_block_desc_k0_m_k1), + ABlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + ABlockTransferSrcVectorDim, + 2, + ABlockTransferSrcScalarPerVector, + ABlockTransferDstScalarPerVector_K1, + 1, + 1, + AThreadTransferSrcResetCoordinateAfterRun, + true>( + a_grid_desc_k0_m_k1, + make_multi_index(0, m_block_data_idx_on_grid, 0), + a_element_op, + a_block_desc_k0_m_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // B matrix blockwise copy + auto b_blockwise_copy = + ThreadGroupTensorSliceTransfer_v4r1, + BBlockTransferThreadClusterLengths_K0_N_K1, + BBlockTransferThreadClusterArrangeOrder, + FloatAB, + FloatAB, + decltype(b_grid_desc_k0_n_k1), + decltype(b_block_desc_k0_n_k1), + BBlockTransferSrcAccessOrder, + Sequence<1, 0, 2>, + BBlockTransferSrcVectorDim, + 2, + BBlockTransferSrcScalarPerVector, + BBlockTransferDstScalarPerVector_K1, + 1, + 1, + BThreadTransferSrcResetCoordinateAfterRun, + true>( + b_grid_desc_k0_n_k1, + make_multi_index(0, n_block_data_idx_on_grid, 0), + b_element_op, + b_block_desc_k0_n_k1, + make_multi_index(0, 0, 0), + ck::tensor_operation::element_wise::PassThrough{}); + + // GEMM definition + // c_mtx += transpose(a_mtx) * b_mtx + // a_mtx[K0PerBlock, MPerBlock] is in LDS + // b_mtx[K0PerBlock, NPerBlock] is in LDS + // c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in + // register + // sanity check + + auto blockwise_gemm = + BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1{}; + + auto c_thread_buf = blockwise_gemm.GetCThreadBuffer(); + + // LDS allocation for A and B: be careful of alignment + constexpr auto a_block_space_size_aligned = + math::integer_least_multiple(a_block_desc_k0_m_k1.GetElementSpaceSize(), max_lds_align); + + auto a_block_buf = make_dynamic_buffer( + static_cast(p_shared), a_block_desc_k0_m_k1.GetElementSpaceSize()); + + auto b_block_buf = make_dynamic_buffer( + static_cast(p_shared) + a_block_space_size_aligned, + b_block_desc_k0_n_k1.GetElementSpaceSize()); + + constexpr auto a_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + constexpr auto b_block_slice_copy_step = make_multi_index(K0PerBlock, 0, 0); + + // gridwise GEMM pipeline + const index_t K0BlockMainLoop = __builtin_amdgcn_readfirstlane(K0 / K0PerBlock); + + GridwiseGemmPipe::template Run(a_grid_desc_k0_m_k1, + a_block_desc_k0_m_k1, + a_blockwise_copy, + a_grid_buf, + a_block_buf, + a_block_slice_copy_step, + b_grid_desc_k0_n_k1, + b_block_desc_k0_n_k1, + b_blockwise_copy, + b_grid_buf, + b_block_buf, + b_block_slice_copy_step, + blockwise_gemm, + c_thread_buf, + K0BlockMainLoop); + + // shuffle C and write out + { + static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 && + NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0, + "wrong!"); + + constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl); + constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl); + + // TODO: hacky, fix it! + constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 = + blockwise_gemm.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + // TODO: hacky, fix it! 
+ // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp = + blockwise_gemm.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(); + + constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0); + constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1); + constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2); + constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3); + constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4); + constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5); + constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6); + constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7); + + constexpr auto c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl = + GetCBlockDescriptor_MBlock_NXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl(); + + auto c_block_buf = make_dynamic_buffer( + static_cast(p_shared), + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl + .GetElementSpaceSize()); + + constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_tuple(make_freeze_transform(I0), // freeze mblock + make_pass_through_transform( + Number{}), // M0 (MXdlPerWave) per + // shuffle + make_unmerge_transform( + make_tuple(M1, M2, M3, M4)), // M1 = MWave, M2 * M3 * M4 = MPerXdl + make_freeze_transform(I0), // freeze nblock + make_pass_through_transform( + Number{}), // N0 (NXdlPerWave) per + // shuffle + make_unmerge_transform( + make_tuple(N1, N2))), // M1 = MWave, M2 * M3 * M4 = MPerXdl + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<>{}, + Sequence<0>{}, + Sequence<2, 4, 5, 6>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<3, 7>{}) + + ); + + // calculate origin of thread output tensor on global memory + // blockwise GEMM c matrix starting index + const auto c_thread_mtx_on_block = + blockwise_gemm.CalculateCThreadOriginDataIndex(I0, I0, I0, I0); + + const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0]; + const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1]; + + const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))), + make_tuple(Sequence<0, 1, 2, 3, 4>{}), + make_tuple(Sequence<0>{})); + + const auto m_thread_data_on_block_idx = + m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex( + make_multi_index(m_thread_data_on_block)); + + const auto n_thread_data_on_block_to_n0_n1_n2_adaptor = + make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform(make_tuple(N0, N1, N2))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto n_thread_data_on_block_idx = + n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex( + make_multi_index(n_thread_data_on_block)); + + // VGPR to LDS + auto c_thread_copy_vgpr_to_lds = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0, 1, 2, 3, 4, 5, 6, 7>, + 7, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>{ + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_multi_index(0, + 0, + m_thread_data_on_block_idx[I1], + n_thread_data_on_block_idx[I1], + m_thread_data_on_block_idx[I2], + 
m_thread_data_on_block_idx[I3], + m_thread_data_on_block_idx[I4], + n_thread_data_on_block_idx[I2]), + ck::tensor_operation::element_wise::PassThrough{}}; + + auto c_block_copy_lds_to_global = ThreadGroupTensorSliceTransfer_v6r3< + ThisThreadBlock, // ThreadGroup + CElementwiseOperation, // ElementwiseOperation, + CGlobalMemoryDataOperation, // DstInMemOp, + Sequence<1, + CShuffleMXdlPerWavePerShuffle, + MWave * MPerXdl, + 1, + CShuffleNXdlPerWavePerShuffle, + NWave * NPerXdl>, // BlockSliceLengths, + CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl, + Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder, + FloatC, // typename Src0Data, + FloatC, // typename Src1Data, + FloatC, // typename Src2Data, + FloatC, // typename DstData, + decltype( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + decltype( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + decltype( + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + decltype( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl), + Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder, + 5, // index_t VectorDim, + CBlockTransferScalarPerVector_NWaveNPerXdl, // index_t ScalarPerVector, + true, // bool ThreadTransferSrc0ResetCoordinateAfterRun, + false, // bool ThreadTransferSrc1ResetCoordinateAfterRun, + false, // bool ThreadTransferSrc2ResetCoordinateAfterRun, + false> // bool ThreadTransferDstResetCoordinateAfterRun> + {c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(0, 0, 0, 0, 0, 0), + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0), + c_element_op}; + + constexpr auto mxdlperwave_forward_step = + make_multi_index(0, CShuffleMXdlPerWavePerShuffle, 0, 0, 0, 0); + constexpr auto nxdlperwave_forward_step = + make_multi_index(0, 0, 0, 0, CShuffleNXdlPerWavePerShuffle, 0); + constexpr auto nxdlperwave_backward_step = + make_multi_index(0, 0, 0, 0, -CShuffleNXdlPerWavePerShuffle, 0); + + static_for<0, MXdlPerWave, CShuffleMXdlPerWavePerShuffle>{}([&](auto mxdlperwave_iter) { + constexpr auto mxdlperwave = mxdlperwave_iter; + + static_for<0, + NXdlPerWave, + CShuffleNXdlPerWavePerShuffle>{}([&](auto nxdlperwave_iter) { + constexpr bool nxdlperwave_forward_sweep = + (mxdlperwave % (2 * CShuffleMXdlPerWavePerShuffle) == 0); + + constexpr index_t nxdlperwave_value = + nxdlperwave_forward_sweep + ? 
nxdlperwave_iter + : (NXdlPerWave - nxdlperwave_iter - CShuffleNXdlPerWavePerShuffle); + + constexpr auto nxdlperwave = Number{}; + + // make sure it's safe to do ds_write + block_sync_lds(); + + // VGPR to LDS + c_thread_copy_vgpr_to_lds.Run( + c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2, + make_tuple(mxdlperwave, nxdlperwave, I0, I0, I0, I0, I0, I0), + c_thread_buf, + c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2, + c_block_buf); + + // make sure it's safe to do ds_read + block_sync_lds(); + + // LDS to global + c_block_copy_lds_to_global.Run( + c_block_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c_block_buf, + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c0_grid_buf, + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c1_grid_buf, + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + c_grid_buf); + + // move on nxdlperwave dimension + if constexpr(nxdlperwave_forward_sweep && + (nxdlperwave < NXdlPerWave - CShuffleNXdlPerWavePerShuffle)) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_forward_step); + + c_block_copy_lds_to_global.MoveSrc2SliceWindow( + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_forward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_forward_step); + } + else if constexpr((!nxdlperwave_forward_sweep) && (nxdlperwave > 0)) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_backward_step); + + c_block_copy_lds_to_global.MoveSrc2SliceWindow( + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_backward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + nxdlperwave_backward_step); + } + }); + + // move on mxdlperwave dimension + if constexpr(mxdlperwave < MXdlPerWave - CShuffleMXdlPerWavePerShuffle) + { + c_block_copy_lds_to_global.MoveSrc1SliceWindow( + c0_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + mxdlperwave_forward_step); + + c_block_copy_lds_to_global.MoveSrc2SliceWindow( + c1_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + mxdlperwave_forward_step); + + c_block_copy_lds_to_global.MoveDstSliceWindow( + c_grid_desc_mblock_mxdlperwave_mwavemperxdl_nblock_nxdlperwave_nwavenperxdl, + mxdlperwave_forward_step); + } + }); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp new file mode 100644 index 00000000..89efea4d --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_naive_variance.hpp @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
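The header that begins here implements layer normalization with the "naive" variance formula: each row's sum(x) and sum(x^2) are reduced threadwise and then blockwise, after which the kernel uses E[x] = sum(x)/K and var(x) = E[x^2] - E[x]^2 before applying y = gamma * (x - E[x]) / sqrt(var(x) + epsilon) + beta. The sketch below restates that recipe as plain host-side C++ so the kernel's two sweeps over the K tiles are easier to follow; it is an illustration only, the function names are hypothetical, and it deliberately ignores the tiling, vectorization, and LDS reduction that the kernel adds.

#include <cmath>
#include <vector>

// Naive two-moment variance: accumulate sum(x) and sum(x*x) in one sweep,
// then derive E[x] and var(x) = E[x^2] - E[x]^2.
inline void naive_mean_variance(const std::vector<float>& x, float& mean, float& variance)
{
    float sum    = 0.f;
    float sum_sq = 0.f;
    for(float v : x)
    {
        sum += v;
        sum_sq += v * v;
    }
    const float n = static_cast<float>(x.size());
    mean     = sum / n;
    variance = sum_sq / n - mean * mean; // can lose precision when mean*mean is close to E[x^2]
}

// Element-wise normalization applied in the kernel's second sweep:
// y = gamma * (x - E[x]) / sqrt(var(x) + epsilon) + beta
inline float normalize_element(
    float x, float mean, float variance, float gamma, float beta, float epsilon)
{
    return gamma * (x - mean) / std::sqrt(variance + epsilon) + beta;
}

The kernel mirrors this structure: ThreadwiseSumReduce accumulates the two moments per thread slice, BlockwiseSumReduce combines them across the workgroup through LDS, and only then is the element-wise expression evaluated in a second pass over the same K tiles.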
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// Y = Normalization(X, Beta, Gamma) +template +struct GridwiseNormalizationNaiveVariance_mk_to_mk +{ + static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || + (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using BlockwiseSumReduce = PartitionedBlockwiseReduction; + + using ThreadwiseSumReduce = ThreadwiseReduction; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k, + const GridDesc_M_K& gamma_grid_desc_m_k, + const GridDesc_M_K& beta_grid_desc_m_k, + const GridDesc_M_K& y_grid_desc_m_k, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x_global, + const GammaDataType* const __restrict__ p_gamma_global, + const BetaDataType* const __restrict__ p_beta_global, + YDataType* const __restrict__ p_y_global, + const AccElementwiseOperation acc_elementwise_op) + { + if constexpr(SweepOnce) + { + num_k_block_tile_iteration = 1; + } + + // LDS + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto y_global_val_buf = make_dynamic_buffer( + p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + x_thread_buf; + + StaticBuffer + gamma_thread_buf; + + StaticBuffer& beta_thread_buf = gamma_thread_buf; + + StaticBuffer + y_thread_buf; + + StaticBuffer& x_square_thread_buf = y_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer + mean_square_thread_buf; + StaticBuffer& var_thread_buf = + mean_square_thread_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = reduce::Add::template GetIdentityValue(); + mean_square_thread_buf(I) = reduce::Add::template GetIdentityValue(); + }); + + const index_t 
thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_gamma_load = + ThreadwiseTensorSliceTransfer_v2( + gamma_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_beta_load = + ThreadwiseTensorSliceTransfer_v2( + beta_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_y_store = + ThreadwiseTensorSliceTransfer_v1r3( + y_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * KThreadSliceSize), + acc_elementwise_op); + + // Copy x from Cache + // one pass: fwd, second pass: bwd + constexpr auto thread_copy_fwd_step_m_k = + make_multi_index(0, SweepOnce ? 0 : K_BlockTileSize); + constexpr auto thread_copy_bwd_step_m_k = + make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize); + + const auto x_global_val_buf = make_dynamic_buffer( + p_x_global, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto gamma_global_val_buf = make_dynamic_buffer( + p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); + + const auto beta_global_val_buf = make_dynamic_buffer( + p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); + + // E(x), E[x^2], var(x) + // FIXME: Should not hack the transform from deviceOP + int reduce_length = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; + + index_t reducedTiles = 0; + do + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + x_square_thread_buf(Number{}) = + x_thread_buf(Number{}) * x_thread_buf(Number{}); + }); + }); + + ThreadwiseSumReduce::Reduce(x_thread_buf, mean_thread_buf); + ThreadwiseSumReduce::Reduce(x_square_thread_buf, mean_square_thread_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + + ++reducedTiles; + } while(reducedTiles < num_k_block_tile_iteration); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseSumReduce::Reduce(reduce_work_buf, mean_thread_buf(I)); + mean_thread_buf(I) = mean_thread_buf(I) / reduce_length; + + block_sync_lds(); + + BlockwiseSumReduce::Reduce(reduce_work_buf, mean_square_thread_buf(I)); + mean_square_thread_buf(I) = mean_square_thread_buf(I) / reduce_length; + + // var(x) = E[x^2] - E[x]^2 + var_thread_buf(I) = + mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I)); + }); + + // y = (x - E[x]) / sqrt(var[x] + epsilon) + auto 
thread_copy_tail_m_k = (num_k_block_tile_iteration - 1) * thread_copy_fwd_step_m_k; + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + reducedTiles = 0; + do + { + if constexpr(!SweepOnce) + { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf); + } + + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + // normalize + y_thread_buf(Number{}) = + (x_thread_buf(Number{}) - mean_thread_buf(iM)) / + sqrt(var_thread_buf(iM) + epsilon); + + // gamma + y_thread_buf(Number{}) = + y_thread_buf(Number{}) * gamma_thread_buf(Number{}); + }); + }); + + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + // beta + y_thread_buf(Number{}) = + y_thread_buf(Number{}) + beta_thread_buf(Number{}); + }); + }); + + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf, + y_grid_desc_m_k, + y_global_val_buf); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_bwd_step_m_k); + + ++reducedTiles; + } while(reducedTiles < num_k_block_tile_iteration); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp new file mode 100644 index 00000000..7aefd3c0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
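The header that begins here computes the same normalization but replaces the two-moment reduction with Welford's online algorithm (ThreadwiseWelford and BlockwiseWelford), which updates a running mean and a running sum of squared deviations as each element arrives and is numerically more stable than E[x^2] - E[x]^2. Below is a scalar sketch of the per-element update and of the merge step used when partial results are combined; it is illustrative only (the struct and function names are not part of the library), and the in-kernel versions additionally track per-thread element counts and work on vectorized thread buffers.

// Scalar Welford update: one pass, no sum of squares of the raw values.
struct WelfordState
{
    float mean  = 0.f;
    float m2    = 0.f; // running sum of squared deviations from the current mean
    int   count = 0;
};

inline void welford_update(WelfordState& s, float x)
{
    ++s.count;
    const float delta = x - s.mean;
    s.mean += delta / static_cast<float>(s.count);
    s.m2   += delta * (x - s.mean);
}

// Merge of two partial states, as the blockwise step does across the thread cluster.
inline WelfordState welford_merge(const WelfordState& a, const WelfordState& b)
{
    WelfordState r;
    r.count = a.count + b.count;
    if(r.count == 0)
        return r;
    const float delta = b.mean - a.mean;
    const float na    = static_cast<float>(a.count);
    const float nb    = static_cast<float>(b.count);
    const float n     = static_cast<float>(r.count);
    r.mean = a.mean + delta * nb / n;
    r.m2   = a.m2 + b.m2 + delta * delta * na * nb / n;
    return r;
}

// The (biased) variance fed into y = (x - mean) / sqrt(var + epsilon) is m2 / count.
inline float welford_variance(const WelfordState& s)
{
    return s.count > 0 ? s.m2 / static_cast<float>(s.count) : 0.f;
}

Because the merge weights each partial state by its element count, GetKPerThread works out every thread's true element count, including the partial tail tile, before the first sweep.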
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +// Y = Normalization(X, Beta, Gamma) +template +struct GridwiseNormalizationWelfordVariance_mk_to_mk +{ + static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) || + (XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static_assert((YDstVectorDim == 0 && MThreadSliceSize % YDstVectorSize == 0) || + (YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (XSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using BlockwiseWelford = BlockwiseWelford; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + static constexpr index_t K_BlockTileStepSize = KThreadClusterSize * XSrcVectorSize; + + static constexpr auto XThreadBufferNumber = Number{}; + static constexpr auto GammaThreadBufferNumber = Number{}; + static constexpr auto BetaThreadBufferNumber = Number{}; + static constexpr auto YThreadBufferNumber = Number{}; + + __device__ static int GetKPerThread(const GridDesc_M_K& x_grid_desc_m_k, + int thread_k_cluster_id) + { + // FIXME: Should not hack the transform from deviceOP + int kPerBlock = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]; + int kPerThread = + kPerBlock < K_BlockTileSize ? 
0 : KThreadSliceSize * (kPerBlock / K_BlockTileSize); + int kPerBlockTail = kPerBlock - kPerThread * KThreadClusterSize; + + if(kPerBlockTail > 0) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + int thread_max_len = + (thread_k_cluster_id + 1) * XSrcVectorSize + K_BlockTileStepSize * i; + int delta = thread_max_len - kPerBlockTail; + delta = math::clamp(thread_max_len - kPerBlockTail, 0, XSrcVectorSize); + kPerThread += XSrcVectorSize - delta; + }); + } + + return kPerThread; + } + + __device__ static void Run(const GridDesc_M_K& x_grid_desc_m_k, + const GridDesc_M_K& gamma_grid_desc_m_k, + const GridDesc_M_K& beta_grid_desc_m_k, + const GridDesc_M_K& y_grid_desc_m_k, + index_t num_k_block_tile_iteration, + AccDataType epsilon, + const XDataType* const __restrict__ p_x_global, + const GammaDataType* const __restrict__ p_gamma_global, + const BetaDataType* const __restrict__ p_beta_global, + YDataType* const __restrict__ p_y_global, + const AccElementwiseOperation acc_elementwise_op) + { + if constexpr(SweepOnce) + { + num_k_block_tile_iteration = 1; + } + + auto y_global_val_buf = make_dynamic_buffer( + p_y_global, y_grid_desc_m_k.GetElementSpaceSize()); + + auto x_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto gamma_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto beta_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + auto y_thread_buf = generate_tuple( + [&](auto) { + return StaticBuffer{}; + }, + Number{}); + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + using ThreadBufferLengths_M_K = Sequence; + constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + auto threadwise_x_load = ThreadwiseTensorSliceTransfer_v2( + x_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * XSrcVectorSize)); + + auto threadwise_gamma_load = + ThreadwiseTensorSliceTransfer_v2( + gamma_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * GammaSrcVectorSize)); + + auto threadwise_beta_load = + ThreadwiseTensorSliceTransfer_v2( + beta_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * BetaSrcVectorSize)); + + auto threadwise_y_store = + ThreadwiseTensorSliceTransfer_v1r3( + y_grid_desc_m_k, + make_multi_index(block_global_id * M_BlockTileSize + + thread_m_cluster_id * MThreadSliceSize, + thread_k_cluster_id * YDstVectorSize), + acc_elementwise_op); + + constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize); + constexpr auto thread_copy_bwd_step_m_k = + make_multi_index(0, SweepOnce ? 
0 : -K_BlockTileSize); + + const auto x_global_val_buf = make_dynamic_buffer( + p_x_global, x_grid_desc_m_k.GetElementSpaceSize()); + + const auto gamma_global_val_buf = make_dynamic_buffer( + p_gamma_global, gamma_grid_desc_m_k.GetElementSpaceSize()); + + const auto beta_global_val_buf = make_dynamic_buffer( + p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize()); + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + threadwise_welford.Run(x_thread_buf[i], mean_thread_buf, var_thread_buf); + }); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + int count = threadwise_welford.cur_count_; + BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count); + }); + + auto thread_copy_tail_m_k = + (num_k_block_tile_iteration - 1) * XThreadBufferNumber * thread_copy_fwd_step_m_k; + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, thread_copy_tail_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_tail_m_k); + + for(index_t reducedTiles = 0; reducedTiles < num_k_block_tile_iteration; ++reducedTiles) + { + if constexpr(!SweepOnce) + { + static_for<0, XThreadBufferNumber, 1>{}([&](auto i) { + threadwise_x_load.Run(x_grid_desc_m_k, + x_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + x_thread_buf(i)); + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + } + + static_for<0, GammaThreadBufferNumber, 1>{}([&](auto i) { + threadwise_gamma_load.Run(gamma_grid_desc_m_k, + gamma_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + gamma_thread_buf(i)); + + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + auto divisor = 1 / __builtin_amdgcn_sqrtf(var_thread_buf(iM) + epsilon); + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // normalize + y_thread_buf(iK0)(Number{}) = + (x_thread_buf(iK0)(Number{}) - mean_thread_buf(iM)) * + divisor; + + // gamma + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) * + gamma_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, BetaThreadBufferNumber, 1>{}([&](auto i) { + threadwise_beta_load.Run(beta_grid_desc_m_k, + beta_global_val_buf, + thread_buffer_desc_m_k, + make_tuple(I0, I0), + beta_thread_buf(i)); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + thread_copy_fwd_step_m_k); + }); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, XThreadBufferNumber, 1>{}([&](auto iK0) { + static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) { + constexpr 
auto offset_m_k = + thread_buffer_desc_m_k.CalculateOffset(make_tuple(iM, iK1)); + + // beta + y_thread_buf(iK0)(Number{}) = + y_thread_buf(iK0)(Number{}) + + beta_thread_buf(iK0)(Number{}); + }); + }); + }); + + static_for<0, YThreadBufferNumber, 1>{}([&](auto i) { + threadwise_y_store.Run(thread_buffer_desc_m_k, + make_tuple(I0, I0), + y_thread_buf(i), + y_grid_desc_m_k, + y_global_val_buf); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, thread_copy_fwd_step_m_k); + }); + + threadwise_x_load.MoveSrcSliceWindow(x_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + threadwise_gamma_load.MoveSrcSliceWindow(gamma_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_beta_load.MoveSrcSliceWindow(beta_grid_desc_m_k, + 2 * thread_copy_bwd_step_m_k); + threadwise_y_store.MoveDstSliceWindow(y_grid_desc_m_k, 2 * thread_copy_bwd_step_m_k); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp new file mode 100644 index 00000000..de1ae915 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_permute.hpp @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/tensor_description/cluster_descriptor.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_nd_permute(const InGridDesc in_grid_desc, + const OutGridDesc out_grid_desc, + const InDataType* p_in_global, + OutDataType* p_out_global, + const ElementwiseOperation elementwise_op, + const Block2TileMap block_2_tile_map) +{ + __shared__ char p_shared[GridwisePermute::GetSharedMemoryNumberOfByte()]; + + GridwisePermute::Run(in_grid_desc, + out_grid_desc, + p_in_global, + p_out_global, + p_shared, + elementwise_op, + block_2_tile_map); +} + +template +struct GridwisePermute +{ + static_assert(InGridDesc::GetNumOfDimension() == OutGridDesc::GetNumOfDimension()); + static_assert(3 <= InGridDesc::GetNumOfDimension()); + static_assert((InGridDesc::GetNumOfDimension() - 2) <= SrcVectorDim && + SrcVectorDim < InGridDesc::GetNumOfDimension()); + static_assert((OutGridDesc::GetNumOfDimension() - 2) <= DstVectorDim && + DstVectorDim < OutGridDesc::GetNumOfDimension()); + static_assert(SrcVectorDim != DstVectorDim); + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + using ThisThreadBlock = ThisThreadBlock; + + struct Block2TileMap + { + static constexpr index_t NumDim = InGridDesc::GetNumOfDimension(); + static_assert(3 <= NumDim); + + static constexpr auto I0 = Number<0>{}; + + Block2TileMap() = delete; + Block2TileMap(const Block2TileMap&) = default; + Block2TileMap(Block2TileMap&&) = delete; + + ~Block2TileMap() = default; + + Block2TileMap& operator=(const Block2TileMap&) = delete; + Block2TileMap& operator=(Block2TileMap&&) = delete; + + explicit Block2TileMap(const InGridDesc& desc) : desc_(desc) {} + + __host__ constexpr index_t CalculateGridSize(const InGridDesc& desc) const + { + const auto N0 = + math::integer_divide_ceil(desc.GetLength(Number{}), NPerBlock); + const auto H0 = + math::integer_divide_ceil(desc.GetLength(Number{}), 
HPerBlock); + const auto W0 = + math::integer_divide_ceil(desc.GetLength(Number{}), WPerBlock); + + const index_t grid_size = N0 * H0 * W0; + + return grid_size; + } + + template + __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const + { + static_assert(TopIdx::Size() == 1); + + auto block_1d_id = idx_top[I0]; + + const auto N0 = + math::integer_divide_ceil(desc_.GetLength(Number{}), NPerBlock); + const auto H0 = + math::integer_divide_ceil(desc_.GetLength(Number{}), HPerBlock); + const auto W0 = + math::integer_divide_ceil(desc_.GetLength(Number{}), WPerBlock); + + block_1d_id = block_1d_id % (N0 * H0 * W0); + + index_t idx_N0 = block_1d_id / (H0 * W0); + index_t idx_H0 = (block_1d_id % (H0 * W0)) / W0; + index_t idx_W0 = block_1d_id % W0; + + return make_tuple(idx_N0, idx_H0, idx_W0); + } + + private: + const InGridDesc desc_; + }; + + using DefaultBlock2TileMap = Block2TileMap; + + // use an [NPerBlock, HPerBlock, WPerBlock] tensor as element-copy relay + __host__ __device__ static constexpr auto GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock() + { + return make_naive_tensor_descriptor( + make_tuple(Number{}, Number{}, Number{}), + make_tuple(Number{}, + Number{}, + I1)); + } + + // for N-dimension descriptor, reserve its last 2 dimensions, then merge its leading dimensions + // into single one. finally, form a 3D descriptor: [d(0), d(1), ..., d(N - 2), d(N - 1)] -> + // [(d(0) x d(1) x ...), d(N - 2), d(N - 1)] + template + __host__ __device__ static constexpr auto GetMergedDesc(const GridDesc& desc) + { + constexpr index_t NumDim = GridDesc::GetNumOfDimension(); + static_assert(3 <= NumDim); + + const auto merged_desc = transform_tensor_descriptor( + desc, + make_tuple(make_merge_transform(generate_tuple( + [&](auto I) { return desc.GetLength(I); }, Number{})), + make_pass_through_transform(desc.GetLength(Number{})), + make_pass_through_transform(desc.GetLength(Number{}))), + make_tuple(generate_sequence_v2([&](auto I) { return I; }, Number{}), + Sequence{}, + Sequence{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + return merged_desc; + } + + __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte() + { + constexpr auto in_block_desc_nperblock_hperblock_wperblock = + GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock(); + + return in_block_desc_nperblock_hperblock_wperblock.GetElementSpaceSize() * + sizeof(InDataType); + } + + __host__ __device__ static constexpr auto MakeDefaultBlock2TileMap(const InGridDesc& desc) + { + return DefaultBlock2TileMap{desc}; + } + + __host__ __device__ static constexpr bool CheckValidity(const InGridDesc& in_grid_desc, + const OutGridDesc& out_grid_desc) + { + constexpr index_t NumDim = InGridDesc::GetNumOfDimension(); + + // check if we only swap last 2 dimensions + bool valid = true; + static_for<0, NumDim - 2, 1>{}([&](auto I) { + if(valid && in_grid_desc.GetLength(I) != out_grid_desc.GetLength(I)) + { + valid = false; + } + }); + + return valid && + (in_grid_desc.GetLength(Number{}) == + out_grid_desc.GetLength(Number{})) && + (in_grid_desc.GetLength(Number{}) == + out_grid_desc.GetLength(Number{})); + } + + template + __device__ static void Run(const InGridDesc in_grid_desc, + const OutGridDesc out_grid_desc, + const InDataType* p_in_global, + OutDataType* p_out_global, + void* __restrict__ p_shared, + const ElementwiseOperation elementwise_op, + const Block2TileMap& block_2_tile_map) + { + auto in_global_buf = make_dynamic_buffer( + p_in_global, in_grid_desc.GetElementSpaceSize()); + + 
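        // Illustrative note, not part of the original kernel: the Block2TileMap defined
        // above turns the flat workgroup id into a (tile-n, tile-h, tile-w) coordinate,
        // which the code below scales by NPerBlock/HPerBlock/WPerBlock to locate this
        // workgroup's slice. Worked example with hypothetical tile counts
        // N0 = 2, H0 = 3, W0 = 4 and block_1d_id = 17:
        //   idx_N0 = 17 / (3 * 4)       = 1
        //   idx_H0 = (17 % (3 * 4)) / 4 = 1
        //   idx_W0 = 17 % 4             = 1
        // so this workgroup handles the [NPerBlock, HPerBlock, WPerBlock] tile whose
        // origin is (1 * NPerBlock, 1 * HPerBlock, 1 * WPerBlock) in the merged
        // [N, H, W] view of the input, and writes it back through the transposed
        // [N, W, H] view of the output.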
auto out_global_buf = make_dynamic_buffer( + p_out_global, out_grid_desc.GetElementSpaceSize()); + + // each workgroup handles an [NPerBlock, HPerBlock, WPerBLock] slice-transpose problem + const auto block_work_idx = + block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); + + const index_t n_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I0] * NPerBlock); + + const index_t h_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I1] * HPerBlock); + + const index_t w_block_data_idx_on_grid = + __builtin_amdgcn_readfirstlane(block_work_idx[I2] * WPerBlock); + + // create [NPerBlock, HPerBlock, WPerBLock] shaped LDS buffer + constexpr auto in_block_desc_nperblock_hperblock_wperblock = + GetInBlockDesc_NPerBlock_HPerBlock_WPerBlock(); + + auto in_block_buf = make_dynamic_buffer( + static_cast(p_shared), + in_block_desc_nperblock_hperblock_wperblock.GetElementSpaceSize()); + + using BlockSliceLengths = Sequence; + using InBlockTransferAccessOrder = Sequence<0, 1, 2>; + + constexpr index_t SrcVectorDimAfterMerge = + SrcVectorDim - (InGridDesc::GetNumOfDimension() - 3); + constexpr index_t DstVectorDimAfterMerge = SrcVectorDimAfterMerge; + + using ck::tensor_operation::element_wise::PassThrough; + + // merge input descriptor into [(in_grid_desc.GetLength(0) x in_grid_desc.GetLength(1) x + // ...), in_grid_desc.GetLength(NumDim - 2), in_grid_desc.GetLength(NumDim - 1)] + const auto in_grid_desc_n_h_w = GetMergedDesc(in_grid_desc); + + // a workgroup copies an [NPerBlock, HPerBlock, WPerBlock] slice from global memory to LDS + auto in_global_load = ThreadGroupTensorSliceTransfer_v4r1< + ThisThreadBlock, + ElementwiseOperation, + PassThrough, + InMemoryDataOperationEnum::Set, + BlockSliceLengths, + InBlockTransferThreadClusterLengths, + InBlockTransferThreadClusterArrangeOrder, + InDataType, + InDataType, + decltype(in_grid_desc_n_h_w), + decltype(in_block_desc_nperblock_hperblock_wperblock), + InBlockTransferAccessOrder, + InBlockTransferAccessOrder, + SrcVectorDimAfterMerge, + 2, + SrcScalarPerVector, + 1, + 1, + 1, + true, + true>(in_grid_desc_n_h_w, + make_multi_index( + n_block_data_idx_on_grid, h_block_data_idx_on_grid, w_block_data_idx_on_grid), + PassThrough{}, + in_block_desc_nperblock_hperblock_wperblock, + make_multi_index(0, 0, 0), + PassThrough{}); + + // merge output descriptor into [(out_grid_desc.GetLength(0) x out_grid_desc.GetLength(1) x + // ...), out_grid_desc.GetLength(NumDim - 2), out_grid_desc.GetLength(NumDim - 1)] + const auto out_grid_desc_n_w_h = GetMergedDesc(out_grid_desc); + + // create transposed view of output tensor + const auto out_grid_desc_n_h_w = transform_tensor_descriptor( + out_grid_desc_n_w_h, + make_tuple(make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I0)), + make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I1)), + make_pass_through_transform(out_grid_desc_n_w_h.GetLength(I2))), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<2>{}, Sequence<1>{})); + + // a workgroup copies an [NPerBlock, HPerBlock, WPerBlock] slice from LDS to global memory + auto out_global_store = ThreadGroupTensorSliceTransfer_v4r1< + ThisThreadBlock, + ElementwiseOperation, + PassThrough, + InMemoryDataOperationEnum::Set, + BlockSliceLengths, + InBlockTransferThreadClusterLengths, + InBlockTransferThreadClusterArrangeOrder, + InDataType, + OutDataType, + decltype(in_block_desc_nperblock_hperblock_wperblock), + decltype(out_grid_desc_n_h_w), + 
InBlockTransferAccessOrder, + InBlockTransferAccessOrder, + 2, + DstVectorDimAfterMerge, + 1, + DstScalarPerVector, + 1, + 1, + true, + true>(in_block_desc_nperblock_hperblock_wperblock, + make_multi_index(0, 0, 0), + PassThrough{}, + out_grid_desc_n_h_w, + make_multi_index( + n_block_data_idx_on_grid, h_block_data_idx_on_grid, w_block_data_idx_on_grid), + elementwise_op); + + in_global_load.Run(in_grid_desc_n_h_w, + in_global_buf, + in_block_desc_nperblock_hperblock_wperblock, + in_block_buf, + I0); + + out_global_store.Run(in_block_desc_nperblock_hperblock_wperblock, + in_block_buf, + out_grid_desc_n_h_w, + out_global_buf, + I0); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp new file mode 100644 index 00000000..901e7aee --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void kernel_buffer_set_value(const Grid1dBufferDescType grid_1d_buffer_desc, + DataType* const __restrict__ p_global, + DataType value) + +{ + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + constexpr auto I0 = Number<0>{}; + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + const index_t thread_global_id = block_global_id * BlockSize + thread_local_id; + + StaticBuffer value_buf; + + value_buf(I0) = value; + + constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + auto global_buf = make_dynamic_buffer( + p_global, grid_1d_buffer_desc.GetElementSpaceSize()); + + if(thread_global_id < grid_1d_buffer_desc.GetElementSize()) + { + auto threadwise_store = ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + grid_1d_buffer_desc, make_multi_index(thread_global_id), PassThroughOp{}); + + threadwise_store.Run( + val_buff_desc, make_tuple(I0), value_buf, grid_1d_buffer_desc, global_buf); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp new file mode 100644 index 00000000..88c7b6ac --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_set_multiple_buffer_value.hpp @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
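The two headers around this point are small utility kernels that fill device buffers with a constant: kernel_buffer_set_value above writes one value into a single buffer, and the kernel_multiple_buffer_set_value that follows does the same for a tuple of buffers and values in one launch, generating a per-buffer StaticBuffer and threadwise store inside a static_for over the tuple. Stripped of the tensor-descriptor machinery, each thread writes the constant into the element addressed by its global thread id (the single-buffer version additionally guards with a bound check); the HIP sketch below shows that core pattern, with a kernel name and launch shape that are illustrative rather than part of the library API.

#include <hip/hip_runtime.h>

// One thread per element: write `value` into p_global[tid] if tid is in range.
template <typename DataType>
__global__ void fill_buffer_sketch(DataType* p_global, DataType value, int element_count)
{
    const int tid = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
    if(tid < element_count)
    {
        p_global[tid] = value;
    }
}

// Hypothetical launch: 256 threads per block, enough blocks to cover n elements.
// hipLaunchKernelGGL(fill_buffer_sketch<float>,
//                    dim3((n + 255) / 256), dim3(256), 0, nullptr,
//                    d_ptr, 0.f, n);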
+ +#pragma once + +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" + +namespace ck { + +template +__global__ void +kernel_multiple_buffer_set_value(const Grid1dBufferDescTuple grid_1d_buffer_desc_tuple, + DataTypePointerTuple p_global_tuple, + DataTypeTuple value_tuple) + +{ + static_assert(NumBuffer == DataTypePointerTuple::Size() && NumBuffer == DataTypeTuple::Size(), + "The tuple size should be same as NumBuffer!"); + + static_for<0, NumBuffer, 1>{}([&](auto iB) { + using DataTypePointer = remove_cvref_t; + using DataTypeFromPointer = remove_pointer_t; + using DataType = remove_cvref_t; + + static_assert(is_same::value, + "Types in tuples does not match!"); + }); + + constexpr auto I0 = Number<0>{}; + + const index_t thread_global_id = get_thread_global_1d_id(); + + auto value_buf_tuple = generate_tuple( + [&](auto iB) { + using DataType = remove_cvref_t; + + return StaticBuffer{}; + }, + Number{}); + + static_for<0, NumBuffer, 1>{}([&](auto iB) { + static_for<0, 1, 1>{}([&](auto J) { value_buf_tuple(iB)(J) = value_tuple[iB]; }); + }); + + auto global_buf_tuple = generate_tuple( + [&](auto iB) { + return make_dynamic_buffer( + p_global_tuple(iB), grid_1d_buffer_desc_tuple[iB].GetElementSpaceSize()); + }, + Number{}); + + constexpr auto val_buff_desc = make_naive_tensor_descriptor_packed(make_tuple(Number<1>{})); + + static_for<0, NumBuffer, 1>{}([&](auto iB) { + using DataType = remove_cvref_t; + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + auto threadwise_store = + ThreadwiseTensorSliceTransfer_v1r3, + Sequence<0>, + 0, + 1, + InMemoryDataOperationEnum::Set, + 1, + true>( + grid_1d_buffer_desc_tuple[iB], make_multi_index(thread_global_id), PassThroughOp{}); + + threadwise_store.Run(val_buff_desc, + make_tuple(I0), + value_buf_tuple(iB), + grid_1d_buffer_desc_tuple[iB], + global_buf_tuple(iB)); + }); +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp new file mode 100644 index 00000000..0344e683 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_softmax.hpp @@ -0,0 +1,407 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
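The header that begins here implements a numerically stable softmax over the K dimension of an [M, K] view, optionally split across several workgroups per row (block_group_size) and swept in K_BlockTileSize tiles. The kernel makes three passes over the same data: row maximum, sum of shifted exponentials, and the final scaled write. The scalar sketch below shows the same three passes for a single row in plain C++; it is illustrative only, the function name is hypothetical, and alpha/beta are used in the y = alpha * softmax(x) + beta * y_prev sense suggested by the prior-output load in the beta != 0 branch.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

inline void
softmax_row_sketch(const std::vector<float>& x, std::vector<float>& y, float alpha, float beta)
{
    // Pass 1: row maximum, so exp() never sees a large positive argument.
    const float x_max = *std::max_element(x.begin(), x.end());

    // Pass 2: sum of shifted exponentials.
    float sum = 0.f;
    for(float v : x)
        sum += std::exp(v - x_max);

    // Pass 3: scaled output, optionally blended with the previous contents of y.
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        const float s = alpha * std::exp(x[i] - x_max) / sum;
        y[i] = (beta == 0.f) ? s : s + beta * y[i];
    }
}

In the kernel these passes correspond to the successive do-while loops over num_k_block_tile_iteration, sweeping forward and backward over the same K tiles, with the NaN-propagation trick described in the comment above threadwise_src_load standing in for an explicit bound check on padded elements; when SweepOnce is true the step sizes collapse to zero and a single tile covers the whole row.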
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp" +#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { + +template +__global__ void kernel_softmax(const GridDesc_M_K in_grid_desc_m_k, + const GridDesc_M_K out_grid_desc_m_k, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global) +{ + GridwiseReduction::Run(in_grid_desc_m_k, + out_grid_desc_m_k, + block_group_size, + num_k_block_tile_iteration, + alpha, + p_in_value_global, + beta, + p_out_value_global); +}; + +template +struct GridwiseSoftmax_mk_to_mk +{ + static_assert(((InSrcVectorDim == 0 && MThreadSliceSize % InSrcVectorSize == 0) || + (InSrcVectorDim == 1 && KThreadSliceSize % InSrcVectorSize == 0)) && + (KThreadSliceSize % OutDstVectorSize == 0), + "Invalid thread slice sizes and/or vector sizes configuration, please check!"); + + static constexpr bool reorder_thread_cluster = (InSrcVectorDim == 0); + + using ThreadClusterLengths_M_K = Sequence; + + using ThreadBufferDimAccessOrder = + typename conditional, Sequence<0, 1>>::type; + + using ThreadClusterArrangeOrder = + typename conditional, Sequence<0, 1>>::type; + + static constexpr auto thread_cluster_desc = + make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{}); + + using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{}))); + using ThreadReduceDstDesc_M = + decltype(make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using PassThroughOp = tensor_operation::element_wise::PassThrough; + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + __device__ static void Run(const GridDesc_M_K& in_grid_desc_m_k, + const GridDesc_M_K& out_grid_desc_m_k, + index_t block_group_size, + index_t num_k_block_tile_iteration, + AccDataType alpha, + const InDataType* const __restrict__ p_in_value_global, + AccDataType beta, + OutDataType* const __restrict__ p_out_value_global) + { + if constexpr(SweepOnce) + { + num_k_block_tile_iteration = 1; + } + + // LDS + __shared__ AccDataType p_reduce_work_buffer[BlockSize]; + + auto out_global_val_buf = make_dynamic_buffer( + p_out_value_global, out_grid_desc_m_k.GetElementSpaceSize()); + + auto reduce_work_buf = + make_dynamic_buffer(p_reduce_work_buffer, BlockSize); + + StaticBuffer + in_thread_buf; + + StaticBuffer + out_thread_buf; + + StaticBuffer max_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + max_value_buf(I) = reduce::Max::template GetIdentityValue(); + }); + + StaticBuffer accu_value_buf; + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + accu_value_buf(I) = reduce::Add::template GetIdentityValue(); + }); + + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + const index_t blkgroup_id = 
block_global_id / block_group_size; + const index_t block_local_id = block_global_id % block_group_size; + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_m_cluster_id = thread_cluster_idx[I0]; + const auto thread_k_cluster_id = thread_cluster_idx[I1]; + + const index_t reduceSizePerBlock = K_BlockTileSize * num_k_block_tile_iteration; + + using ThreadBufferLengths = Sequence; + constexpr auto thread_buffer_desc = make_naive_tensor_descriptor_packed( + make_tuple(Number{}, Number{})); + + // Normally, 0 as invalid element value is adequate since 0 makes no contribution to + // accumulated result. However, in stable softmax, all values 0s or not are subtracted by + // another value_max. As numbers become non-zero, effectively it allows invalid values to + // slip through and contribute to the accumulated result. + // + // The trick here is leveraging the fact that many math functions (add, sub, exp, ...) + // propagate NaNs when operands have NaNs involved. By initialiing invalid element value + // with NaN, an invalid value doing math manipulations is still NaN, which in turn can still + // be identified as an invalid value. We can then discard the invalid values which + // originally failed the bound check during accumulation. This allows to ignore values that + // failed bound check even after multiple math manipulations. + // + // NOTE: reset coordinate after every step because the same threadwise copy will sweep + // through global memory 3 times back and forth + auto threadwise_src_load = ThreadwiseTensorSliceTransfer_v2( + in_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dst_load = ThreadwiseTensorSliceTransfer_v2( + out_grid_desc_m_k, + make_multi_index(blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + + thread_k_cluster_id * KThreadSliceSize)); + + auto threadwise_dst_store = + ThreadwiseTensorSliceTransfer_v1r3( + out_grid_desc_m_k, + make_multi_index( + blkgroup_id * M_BlockTileSize + thread_m_cluster_id * MThreadSliceSize, + block_local_id * reduceSizePerBlock + thread_k_cluster_id * KThreadSliceSize), + PassThroughOp{}); + + constexpr auto in_thread_copy_fwd_step = + make_multi_index(0, SweepOnce ? 0 : K_BlockTileSize); + constexpr auto in_thread_copy_bwd_step = + make_multi_index(0, SweepOnce ? 
0 : -K_BlockTileSize); + + /// + /// max(x) + /// + using BlockwiseMaxReduce = PartitionedBlockwiseReduction< + AccDataType, + BlockSize, + ThreadClusterLengths_M_K, + ThreadClusterArrangeOrder, + reduce::Max, + false, // param ignored + detail::AccumulateWithNanIgnore>; + + using ThreadwiseMaxReduce = + ThreadwiseReduction>; + + const auto in_global_val_buf = make_dynamic_buffer( + p_in_value_global, in_grid_desc_m_k.GetElementSpaceSize()); + + index_t reducedTiles = 0; + do + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + + ThreadwiseMaxReduce::Reduce(in_thread_buf, max_value_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I)); + block_sync_lds(); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_bwd_step); + + /// + /// sum(exp(x - max(x))) + /// + using BlockwiseSumReduce = PartitionedBlockwiseReduction< + AccDataType, + BlockSize, + ThreadClusterLengths_M_K, + ThreadClusterArrangeOrder, + reduce::Add, + false, // ignored + detail::AccumulateWithNanIgnore>; + + using ThreadwiseSumReduce = + ThreadwiseReduction>; + + reducedTiles = 0; + do + { + if constexpr(!SweepOnce) + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + } + + // do element-wise pre-reduction operation + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + out_thread_buf(Number{}) = + math::exp(in_thread_buf(Number{}) - max_value_buf(iM)); + }); + }); + + ThreadwiseSumReduce::Reduce(out_thread_buf, accu_value_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_bwd_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + + block_sync_lds(); // wait for reading being complete before writing to LDS + static_for<0, MThreadSliceSize, 1>{}([&](auto I) { + BlockwiseSumReduce::Reduce(reduce_work_buf, accu_value_buf(I)); + block_sync_lds(); + }); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step); + + /// + /// softmax + /// + reducedTiles = 0; + if(float_equal_zero{}(beta)) + { + do + { + if constexpr(!SweepOnce) + { + threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + } + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // out = alpha * exp(x - max(x)) / sum(exp(x - max(x))) + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + out_thread_buf(Number{}) = + alpha * math::exp(in_thread_buf(Number{}) - max_value_buf(iM)) / + accu_value_buf(iM); + }); + }); + + threadwise_dst_store.Run(thread_buffer_desc, + make_tuple(I0, I0), + out_thread_buf, + out_grid_desc_m_k, + out_global_val_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step); + threadwise_dst_store.MoveDstSliceWindow(out_grid_desc_m_k, in_thread_copy_fwd_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + } + else + { + StaticBuffer + in_prior_dst_buf; + do + { + if constexpr(!SweepOnce) + { 
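+                    // Third sweep for the beta != 0 path: the same source window is re-read tile by
+                    // tile (it was swept forward for max(x) and backward for sum(exp(x - max(x)))),
+                    // and the prior output tile is loaded just below so that beta * prior_out can be
+                    // blended into the result.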
+ threadwise_src_load.Run(in_grid_desc_m_k, + in_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_thread_buf); + } + threadwise_dst_load.Run(out_grid_desc_m_k, + out_global_val_buf, + thread_buffer_desc, + make_tuple(I0, I0), + in_prior_dst_buf); + + static_for<0, MThreadSliceSize, 1>{}([&](auto iM) { + // out = alpha * exp(x - max(x)) / sum(exp(x - max(x))) + beta * prior_out + static_for<0, KThreadSliceSize, 1>{}([&](auto iK) { + constexpr auto offset = + thread_buffer_desc.CalculateOffset(make_tuple(iM, iK)); + out_thread_buf(Number{}) = + alpha * math::exp(in_thread_buf(Number{}) - max_value_buf(iM)) / + accu_value_buf(iM) + + beta * in_prior_dst_buf(Number{}); + }); + }); + + threadwise_dst_store.Run(thread_buffer_desc, + make_tuple(I0, I0), + out_thread_buf, + out_grid_desc_m_k, + out_global_val_buf); + + threadwise_src_load.MoveSrcSliceWindow(in_grid_desc_m_k, in_thread_copy_fwd_step); + threadwise_dst_store.MoveDstSliceWindow(out_grid_desc_m_k, in_thread_copy_fwd_step); + threadwise_dst_load.MoveSrcSliceWindow(out_grid_desc_m_k, in_thread_copy_fwd_step); + + reducedTiles++; + } while(reducedTiles < num_k_block_tile_iteration); + } + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp new file mode 100644 index 00000000..3de6aa08 --- /dev/null +++ b/include/ck/tensor_operation/gpu/grid/gridwise_sparse_embedding3_forward_layernorm.hpp @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_welford.hpp" +#include "ck/tensor_operation/gpu/block/blockwise_welford.hpp" + +namespace ck { + +template +#if CK_USE_LAUNCH_BOUNDS +__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + __global__ void kernel_sparse_embedding3_forward_layernorm(OutType* p_out, + const EmbType* p_emb_a, + const EmbType* p_emb_b, + const EmbType* p_emb_c, + const IndexType* p_index_a, + const IndexType* p_index_b, + const IndexType* p_index_c, + const GammaDataType* p_gamma, + const BetaDataType* p_beta, + const OutGridDesc out_grid_desc, + const AccDataType epsilon) +{ + GridwiseSparseEmbedding::Run(p_out, + p_emb_a, + p_emb_b, + p_emb_c, + p_index_a, + p_index_b, + p_index_c, + p_gamma, + p_beta, + out_grid_desc, + epsilon); +} + +template +struct GridwiseSparseEmbedding3ForwardLayernorm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr index_t WaveSize = 64; + + static_assert(BlockSize == RowClusterSize * DimClusterSize, + "Invalid cluster distribution within block"); + static_assert(RowClusterSize % WaveSize == 0, "need to be wavewise"); + + static_assert(DimPerBlock % (DimClusterSize * DimThreadSize) == 0, ""); + static_assert(RowPerBlock % (RowClusterSize * RowVectorSize) == 0, ""); + + static constexpr auto DimSubBlocks = DimPerBlock / (DimClusterSize * DimThreadSize); + static constexpr auto RowSubBlocks = RowPerBlock / (RowClusterSize * RowVectorSize); + + static_assert((DimPerBlock % DimSubBlocks == 0) && (RowPerBlock % RowSubBlocks == 0), ""); + static constexpr auto DimPerSubBlock = DimPerBlock / DimSubBlocks; 
+ static constexpr auto RowPerSubBlock = RowPerBlock / RowSubBlocks; + + using ThreadwiseWolfordDesc2D = decltype(make_naive_tensor_descriptor_packed(make_tuple( + Number{}, Number{}))); + + using ThreadwiseWolfordDescReduce = decltype( + make_naive_tensor_descriptor_packed(make_tuple(Number{}))); + + using ThreadwiseWelford = + ThreadwiseWelford; + + using ThreadClusterLength = Sequence; + + using BlockwiseWelford = + BlockwiseWelford>; + + __device__ static void Run(OutType* p_out, + const EmbType* p_emb_a, + const EmbType* p_emb_b, + const EmbType* p_emb_c, + const IndexType* p_index_a, + const IndexType* p_index_b, + const IndexType* p_index_c, + const GammaDataType* p_gamma, + const BetaDataType* p_beta, + const OutGridDesc, + const AccDataType epsilon) + { + const index_t thread_local_id = get_thread_local_1d_id(); + const index_t block_global_id = get_block_1d_id(); + + // const auto index_length = out_grid_desc.GetLength(I0); + // const auto emb_dim = out_grid_desc.GetLength(I1); + + constexpr auto thread_cluster_desc = + make_cluster_descriptor(Sequence{}, Sequence<0, 1>{}); + + const auto thread_cluster_idx = + thread_cluster_desc.CalculateBottomIndex(make_multi_index(thread_local_id)); + + const auto thread_dim_cluster_id = thread_cluster_idx[I0]; + const auto thread_row_cluster_id = thread_cluster_idx[I1]; + + const auto wave_dim_id = __builtin_amdgcn_readfirstlane(thread_dim_cluster_id / WaveSize); + + const auto index_start = block_global_id * DimPerBlock + wave_dim_id * DimThreadSize; + + auto threadwise_welford = ThreadwiseWelford(); + threadwise_welford.max_count_ = RowSubBlocks * RowVectorSize; + + constexpr auto thread_buf_size = + DimSubBlocks * DimThreadSize * RowSubBlocks * RowVectorSize; + constexpr auto thread_buf_desc = make_naive_tensor_descriptor_packed( + make_tuple(DimSubBlocks, DimThreadSize, RowSubBlocks, RowVectorSize)); + constexpr auto mean_var_buf_size = DimSubBlocks * DimThreadSize; + constexpr auto mean_var_buf_desc = + make_naive_tensor_descriptor_packed(make_tuple(DimSubBlocks, DimThreadSize)); + constexpr auto gamma_beta_buf_size = RowSubBlocks * RowVectorSize; + constexpr auto gamma_beta_buf_desc = + make_naive_tensor_descriptor_packed(make_tuple(RowSubBlocks, RowVectorSize)); + + StaticBuffer in_thread_buf_a; + StaticBuffer in_thread_buf_b; + StaticBuffer in_thread_buf_c; + + StaticBuffer index_buf_a; + StaticBuffer index_buf_b; + StaticBuffer index_buf_c; + + StaticBuffer acc_thread_buf; + + StaticBuffer + gamma_thread_buf; + StaticBuffer + beta_thread_buf; + + StaticBuffer mean_thread_buf; + StaticBuffer var_thread_buf; + + auto load_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) { + vector_type_maker_t emb_vector_a; + vector_type_maker_t emb_vector_b; + vector_type_maker_t emb_vector_c; + + using src_vector_t = typename decltype(emb_vector_a)::type; + static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) { + constexpr auto current_dim = i_dim_sub_ * DimPerSubBlock + i_dim_vec_; + IndexType index_a = index_buf_a[Number{}]; + IndexType index_b = index_buf_b[Number{}]; + IndexType index_c = index_buf_c[Number{}]; + + auto thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) * + sizeof(EmbType) * RowVectorSize; + + int32x4_t emb_res_a = + make_wave_buffer_resource_with_default_range(p_emb_a + index_a * RowPerBlock); + int32x4_t emb_res_b = + make_wave_buffer_resource_with_default_range(p_emb_b + index_b * RowPerBlock); + int32x4_t emb_res_c = + make_wave_buffer_resource_with_default_range(p_emb_c + index_c * RowPerBlock); 
+ emb_vector_a.template AsType()(I0) = + amd_buffer_load_impl(emb_res_a, thread_offset, 0); + emb_vector_b.template AsType()(I0) = + amd_buffer_load_impl(emb_res_b, thread_offset, 0); + emb_vector_c.template AsType()(I0) = + amd_buffer_load_impl(emb_res_c, thread_offset, 0); + + static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { + constexpr auto register_offset = thread_buf_desc.CalculateOffset( + make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); + in_thread_buf_a(Number{}) = + emb_vector_a.template AsType()[i_row_vec_]; + in_thread_buf_b(Number{}) = + emb_vector_b.template AsType()[i_row_vec_]; + in_thread_buf_c(Number{}) = + emb_vector_c.template AsType()[i_row_vec_]; + }); + }); + }; + + auto accumulate_current_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) { + static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) { + static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { + constexpr auto register_offset = thread_buf_desc.CalculateOffset( + make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); + AccDataType va = + ck::type_convert(in_thread_buf_a(Number{})); + AccDataType vb = + ck::type_convert(in_thread_buf_b(Number{})); + AccDataType vc = + ck::type_convert(in_thread_buf_c(Number{})); + + acc_thread_buf(Number{}) += va + vb + vc; + }); + }); + }; + + auto threadwise_welford_sub_row = [&](auto i_dim_sub_, auto i_row_sub_) { + static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) { + static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { + constexpr auto register_offset = thread_buf_desc.CalculateOffset( + make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); + constexpr auto mean_var_offset = + mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_)); + + threadwise_welford.cur_count_++; + threadwise_welford.Update(mean_thread_buf(Number{}), + var_thread_buf(Number{}), + acc_thread_buf(Number{})); + }); + }); + }; + + auto threadwise_normalize_store_out = [&](auto i_dim_sub_, auto i_row_sub_) { + int32x4_t out_res = + make_wave_buffer_resource_with_default_range(p_out + index_start * RowPerBlock); + static_for<0, DimThreadSize, 1>{}([&](auto i_dim_vec_) { + vector_type_maker_t out_vector; + using dst_vector_t = typename decltype(out_vector)::type; + + constexpr auto mean_var_offset = + mean_var_buf_desc.CalculateOffset(make_tuple(i_dim_sub_, i_dim_vec_)); + + static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { + constexpr auto register_offset = thread_buf_desc.CalculateOffset( + make_tuple(i_dim_sub_, i_dim_vec_, i_row_sub_, i_row_vec_)); + constexpr auto gamma_beta_offset = + gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_)); + + auto acc_val = acc_thread_buf[Number{}]; + acc_val = (acc_val - mean_thread_buf(Number{})) / + sqrt(var_thread_buf(Number{}) + epsilon); + acc_val = acc_val * gamma_thread_buf[Number{}] + + beta_thread_buf[Number{}]; + + out_vector.template AsType()(Number{}) = + type_convert(acc_val); + }); + + index_t thread_offset = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) * + sizeof(OutType) * RowVectorSize; + + amd_buffer_store_impl( + out_vector.template AsType()[Number<0>{}], + out_res, + thread_offset, + 0); + }); + }; + + // first load index + ck::static_for<0, DimPerBlock, 1>{}([&](auto i_idx_) { + // prefer use s_load + index_buf_a(i_idx_) = p_index_a[index_start + i_idx_.value]; + index_buf_b(i_idx_) = p_index_b[index_start + i_idx_.value]; + index_buf_c(i_idx_) = p_index_c[index_start + i_idx_.value]; + }); + + // load gamma/beta + static_for<0, RowSubBlocks, 1>{}([&](auto 
i_row_sub_) { + vector_type_maker_t gamma_vector; + vector_type_maker_t beta_vector; + + index_t thread_offset_gamma = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) * + sizeof(GammaDataType) * RowVectorSize; + index_t thread_offset_beta = (thread_row_cluster_id + i_row_sub_ * RowClusterSize) * + sizeof(BetaDataType) * RowVectorSize; + + int32x4_t gamma_res = make_wave_buffer_resource_with_default_range(p_gamma); + int32x4_t beta_res = make_wave_buffer_resource_with_default_range(p_beta); + + gamma_vector.template AsType()(I0) = + amd_buffer_load_impl( + gamma_res, thread_offset_gamma, 0); + beta_vector.template AsType()(I0) = + amd_buffer_load_impl(beta_res, thread_offset_beta, 0); + + static_for<0, RowVectorSize, 1>{}([&](auto i_row_vec_) { + constexpr auto offset = + gamma_beta_buf_desc.CalculateOffset(make_tuple(i_row_sub_, i_row_vec_)); + gamma_thread_buf(Number{}) = type_convert( + gamma_vector.template AsType()[Number{}]); + beta_thread_buf(Number{}) = type_convert( + beta_vector.template AsType()[Number{}]); + }); + }); + + static_for<0, thread_buf_size, 1>{}( + [&](auto I) { acc_thread_buf(I) = type_convert(0.0f); }); + + static_for<0, mean_var_buf_size, 1>{}([&](auto I) { + mean_thread_buf(I) = type_convert(0.0f); + var_thread_buf(I) = type_convert(0.0f); + }); + + static_for<0, DimSubBlocks, 1>{}([&](auto i_dim_sub) { + load_current_sub_row(i_dim_sub, Number<0>{}); + static_for<0, RowSubBlocks - 1, 1>{}([&](auto i_row) { + load_current_sub_row(i_dim_sub, Number<1>{} + i_row); + accumulate_current_sub_row(i_dim_sub, i_row); + threadwise_welford_sub_row(i_dim_sub, i_row); + }); + accumulate_current_sub_row(i_dim_sub, Number{}); + threadwise_welford_sub_row(i_dim_sub, Number{}); + + // blockwise welford + static_for<0, mean_var_buf_size, 1>{}([&](auto I) { + if constexpr(I > 0) + block_sync_lds(); + + BlockwiseWelford::Run( + mean_thread_buf(I), var_thread_buf(I), threadwise_welford.cur_count_); + }); + + // store + static_for<0, RowSubBlocks, 1>{}( + [&](auto i_row) { threadwise_normalize_store_out(i_dim_sub, i_row); }); + }); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp new file mode 100644 index 00000000..188c62d9 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
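+//
+// In scalar form, the two reducers in this header fold an M x K register tile into an M-element
+// register tile (a sketch only; src/dst stand for the caller's static buffers, M/K for the
+// lengths of SrcThreadDesc_M_K):
+//
+//   for m in [0, M):
+//       for k in [0, K):
+//           dst[m] = OpReduce(dst[m], src[m, k])
+//
+// The *WithIndex variant additionally updates dst_idx[m] with the source index of the element
+// that won the reduction.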
+ +#pragma once + +#include "ck/utility/reduction_functions_accumulate.hpp" + +namespace ck { + +// Assume +// 1) SrcDesc is known at compile-time +// 2) DstDesc is known at compile-time +// 3) SrcBuffer is static buffer +// 4) DstBuffer is static buffer +template > +struct ThreadwiseReduction +{ + static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{}; + static constexpr auto dst_thread_desc_m = DstThreadDesc_M{}; + + static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{}); + static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{}); + static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{}); + + static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!"); + + using Op = OpReduce; + + template + __device__ static void Reduce(const SrcBufferType& src_buf, DstBufferType& dst_buf) + { + static_for<0, src_length_m, 1>{}([&](auto iM) { + constexpr index_t out_offset = dst_thread_desc_m.CalculateOffset(make_tuple(iM)); + + static_for<0, src_length_k, 1>{}([&](auto iK) { + constexpr auto offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + Accumulation::Calculate(dst_buf(Number{}), src_buf[Number{}]); + }); + }); + }; +}; + +// Assume +// 1) SrcDesc is known at compile-time +// 2) DstDesc is known at compile-time +// 3) SrcBuffer is static buffer +// 4) DstBuffer is static buffer +template < + typename AccDataType, + typename IndexDataType, + typename SrcThreadDesc_M_K, + typename DstThreadDesc_M, + typename OpReduce, + bool PropagateNan, + typename Accumulation = + detail::AccumulateWithIndexAndNanCheck> +struct ThreadwiseReductionWithIndex +{ + static constexpr auto src_thread_desc_m_k = SrcThreadDesc_M_K{}; + static constexpr auto dst_thread_desc_m = DstThreadDesc_M{}; + + static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{}); + static constexpr auto src_length_k = src_thread_desc_m_k.GetLength(Number<1>{}); + static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{}); + + static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!"); + + template + __device__ static void Reduce(const SrcValueBufferType& src_val_buf, + const SrcIndexBufferType& src_idx_buf, + DstValueBufferType& dst_val_buf, + DstIndexBufferType& dst_idx_buf) + { + static_for<0, src_length_m, 1>{}([&](auto iM) { + constexpr index_t out_offset = dst_thread_desc_m.CalculateOffset(make_tuple(iM)); + + static_for<0, src_length_k, 1>{}([&](auto iK) { + constexpr auto offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + Accumulation::Calculate(dst_val_buf(Number{}), + src_val_buf[Number{}], + dst_idx_buf(Number{}), + src_idx_buf[Number{}]); + }); + }); + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp new file mode 100644 index 00000000..94cdfe01 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_contraction_dl.hpp @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/math.hpp" + +namespace ck { + +// C[TM0, TM1, TN0, TN1] += A[TK, TM0, TM1] * B[TK, TN0, TN1] +// Tensor element can be vectorized data +// Assume: +// 1. 
AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are +// known at compile-time +// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time +template ::type = false> +struct ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1 +{ + __device__ constexpr ThreadwiseGemmDl_km0m1_kn0n1_m0m1n0n1() + { + static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() && + BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() && + CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + // TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, + // CThreadDesc_TM0_TM1_TN0_TN1 Size with KLenghts, TMLengths and TNLengths + + // TODO remove this restriction + static_assert(TKLengths::Size() == 1 && TMLengths::Size() == 2 && TNLengths::Size() == 2, + "wrong!"); + } + + template + __device__ static void Run(const ABuffer& a_buf, + AOriginIdx, + const BBuffer& b_buf, + BOriginIdx, + CBuffer& c_buf, + COriginIdx) + { + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr auto TK = TKLengths{}[I0]; + constexpr auto TM0 = TMLengths{}[I0]; + constexpr auto TM1 = TMLengths{}[I1]; + constexpr auto TN0 = TNLengths{}[I0]; + constexpr auto TN1 = TNLengths{}[I1]; + + constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); + constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); + constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); + + static_for<0, TK, 1>{}([&](auto tk) { + static_for<0, TM0, 1>{}([&](auto tm0) { + static_for<0, TM1, 1>{}([&](auto tm1) { + static_for<0, TN0, 1>{}([&](auto tn0) { + static_for<0, TN1, 1>{}([&](auto tn1) { + constexpr index_t a_offset = + AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset( + a_origin_idx + make_multi_index(tk, tm0, tm1)); + constexpr index_t b_offset = + BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset( + b_origin_idx + make_multi_index(tk, tn0, tn1)); + constexpr index_t c_offset = + CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( + c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); + + inner_product(a_buf[Number{}], + b_buf[Number{}], + c_buf(Number{})); + }); + }); + }); + }); + }); + } +}; + +// C[TM0, TM1, TN0, TN1] += A[TK0, TM0, TM1, TK1] * B[TK0, TN0, TN1, TK1] +// Tensor element can be vectorized data +// Assume: +// 1. AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, CThreadDesc_TM0_TM1_TN0_TN1 are +// known at compile-time +// 2. AOriginIdx, BOriginIdx, COriginIdx are known at compile-time +template ::type = false> +struct ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1 +{ + __device__ constexpr ThreadwiseContractionDl_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_TN0_TN1() + { + static_assert(AThreadDesc_TK0_TM0_TM1_TK1::IsKnownAtCompileTime() && + BThreadDesc_TK0_TN0_TN1_TK1::IsKnownAtCompileTime() && + CThreadDesc_TM0_TM1_TN0_TN1::IsKnownAtCompileTime(), + "wrong! 
Desc should be known at compile-time"); + + // TODO: sanity-check: compare AThreadDesc_TK0_TM0_TM1_TK1, BThreadDesc_TK0_TN0_TN1_TK1, + // CThreadDesc_TM0_TM1_TN0_TN1 Size with KLenghts, TMLengths and TNLengths + + // TODO remove this restriction + static_assert(TKLengths::Size() == 2 && TMLengths::Size() == 2 && TNLengths::Size() == 2, + "wrong!"); + } + + template + __device__ static void Run(const ABuffer& a_buf, + AOriginIdx, + const BBuffer& b_buf, + BOriginIdx, + CBuffer& c_buf, + COriginIdx) + { + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + constexpr index_t TK0 = TKLengths{}[I0]; + constexpr index_t TK1 = TKLengths{}[I1]; + constexpr index_t TM0 = TMLengths{}[I0]; + constexpr index_t TM1 = TMLengths{}[I1]; + constexpr index_t TN0 = TNLengths{}[I0]; + constexpr index_t TN1 = TNLengths{}[I1]; + + constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); + constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); + constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); + + static_for<0, TK0, 1>{}([&](auto tk0) { + static_for<0, TM0, 1>{}([&](auto tm0) { + static_for<0, TM1, 1>{}([&](auto tm1) { + static_for<0, TN0, 1>{}([&](auto tn0) { + static_for<0, TN1, 1>{}([&](auto tn1) { + vector_type a_vec; + vector_type b_vec; + + static_for<0, TK1, 1>{}([&](auto tk1) { + constexpr index_t a_offset = + AThreadDesc_TK0_TM0_TM1_TK1{}.CalculateOffset( + a_origin_idx + make_multi_index(tk0, tm0, tm1, tk1)); + + constexpr index_t b_offset = + BThreadDesc_TK0_TN0_TN1_TK1{}.CalculateOffset( + b_origin_idx + make_multi_index(tk0, tn0, tn1, tk1)); + + a_vec.template AsType()(tk1) = a_buf[Number{}]; + b_vec.template AsType()(tk1) = b_buf[Number{}]; + }); + + using a_vector_t = typename vector_type::type; + using b_vector_t = typename vector_type::type; + + constexpr index_t c_offset = + CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset( + c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1)); + + inner_product( + a_vec.template AsType()[I0], + b_vec.template AsType()[I0], + c_buf(Number{})); + }); + }); + }); + }); + }); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp new file mode 100644 index 00000000..e045e3b5 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_gemm_dlops_v3.hpp @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_THREADWISE_GEMM_DLOPS_V3_HPP +#define CK_THREADWISE_GEMM_DLOPS_V3_HPP + +#include "common_header.hpp" +#include "math.hpp" + +namespace ck { + +// C[M, N] += transpose(A[K, M]) * B[K, N] +// Element of matrix can be vectorized data +// Assume: +// 1. AThreadDesc_E1_K_E2, BThreadDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo are known at +// compile-time +// 2. 
AOriginIdx, BOriginIdx, COriginIdx are known at compile-time +template ::type = false> +struct ThreadwiseGemmDlops_km_kn_mn_v3 +{ + + template + __device__ static void Run(const ABuffer& a_buf, + AOriginIdx, + const BBuffer& b_buf, + BOriginIdx, + CBuffer& c_buf, + COriginIdx) + { + + static_assert(AThreadDesc_E1_K_E2::IsKnownAtCompileTime() && + BThreadDesc_E1_N_Ho_Wo_E2::IsKnownAtCompileTime() && + CThreadDesc_K_N_Ho_Wo::IsKnownAtCompileTime(), + "wrong! Desc should be known at compile-time"); + + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! AOriginIdx, BOriginIdx, COringinIdx should be known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + constexpr auto E1 = AThreadDesc_E1_K_E2{}.GetLength(I0); + constexpr auto K = AThreadDesc_E1_K_E2{}.GetLength(I1); + constexpr auto E2 = AThreadDesc_E1_K_E2{}.GetLength(I2); + + constexpr auto Ho = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I2); + constexpr auto Wo = BThreadDesc_E1_N_Ho_Wo_E2{}.GetLength(I3); + + constexpr auto a_origin_idx = to_multi_index(AOriginIdx{}); + constexpr auto b_origin_idx = to_multi_index(BOriginIdx{}); + constexpr auto c_origin_idx = to_multi_index(COriginIdx{}); + + if constexpr((Ho % 2 == 0) && (Wo % 2 == 0)) + { + constexpr auto SubHW = 2; + + static_for<0, K, 1>{}([&](auto k) { + static_for<0, Ho, SubHW>{}([&](auto h) { + static_for<0, Wo, SubHW>{}([&](auto w) { + static_for<0, E1, 1>{}([&](auto e1) { + static_for<0, E2, 1>{}([&](auto e2) { + constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset( + a_origin_idx + make_tuple(e1, k, e2)); + + constexpr index_t b0_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h, w, e2)); + + constexpr index_t b1_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h, w + 1, e2)); + + constexpr index_t b2_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h + 1, w, e2)); + + constexpr index_t b3_offset = + BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h + 1, w + 1, e2)); + + constexpr index_t c0_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + + make_tuple(k, 0, h, w)); + + constexpr index_t c1_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( + c_origin_idx + make_tuple(k, 0, h, w + 1)); + + constexpr index_t c2_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( + c_origin_idx + make_tuple(k, 0, h + 1, w)); + + constexpr index_t c3_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset( + c_origin_idx + make_tuple(k, 0, h + 1, w + 1)); + + amd_assembly_outer_product_1x4(a_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + b_buf[Number{}], + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{}), + c_buf(Number{})); + }); + }); + }); + }); + }); + } + else + { + + static_for<0, K, 1>{}([&](auto k) { + static_for<0, Ho, 1>{}([&](auto h) { + static_for<0, Wo, 1>{}([&](auto w) { + static_for<0, E1, 1>{}([&](auto e1) { + static_for<0, E2, 1>{}([&](auto e2) { + constexpr index_t a_offset = AThreadDesc_E1_K_E2{}.CalculateOffset( + a_origin_idx + make_tuple(e1, k, e2)); + + constexpr index_t b_offset = + 
BThreadDesc_E1_N_Ho_Wo_E2{}.CalculateOffset( + b_origin_idx + make_tuple(e1, 0, h, w, e2)); + + constexpr index_t c_offset = + CThreadDesc_K_N_Ho_Wo{}.CalculateOffset(c_origin_idx + + make_tuple(k, 0, h, w)); + + inner_product(a_buf[Number{}], + b_buf[Number{}], + c_buf(Number{})); + }); + }); + }); + }); + }); + } + } +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp new file mode 100644 index 00000000..0a1197a1 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_set.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +namespace ck { + +// Assume: +// 1. Desc is known at compile-time +// 2. Buffer is StaticBuffer +// 3. OriginIdx is known at compile-time +// 4. use #-step +template ::type = false> +struct ThreadwiseTensorSliceSet_v1 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + template + __device__ void Run(const Desc&, const OriginIdx&, Buffer& buf, const Data& initial_value) const + { + static_assert(Desc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert(Buffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); + + static_assert(is_known_at_compile_time>::value, + "wrong! OriginIdx need to be known at compile-time"); + + // Desc is known at compile-time + constexpr auto desc = remove_cvref_t{}; + + // OriginIdx is known at compile-time + constexpr auto origin_idx = to_multi_index(OriginIdx{}); + + static_ford{}([&](auto access_idx) { + constexpr auto coord = make_tensor_coordinate(desc, origin_idx + access_idx); + + constexpr bool is_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(desc, coord); + + constexpr index_t offset = coord.GetOffset(); + + if constexpr(is_valid) + { + buf(Number{}) = initial_value; + } + }); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp new file mode 100644 index 00000000..b0f453b0 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp @@ -0,0 +1,1301 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +namespace ck { + +// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory +// and sometimes useless instructions: +// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument +// instead +// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same +// tensor coordinate instead +// 3. Don't use a pointer to VGPR buffer, use vector instead + +namespace detail { +// TODO: How to fix this? 
It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + return (i == VectorDim) ? ScalarPerVector : 1; + } +}; + +template +struct lambda_scalar_step_in_vector +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + return (i == VectorDim) ? 1 : 0; + } +}; +} // namespace detail + +// Assume: +// 1. src: +// 1. SrcDesc is known at compile-time +// 2. SrcBuffer is StaticBuffer +// 3. SrcSliceOrginIdx is known at compile-time +// 2. dst: +// 1. DstDesc is not known at compile-time +// 2. DstBuffer is DynamicBuffer +// 3. DstSliceOrginIdx is not known at compile time +template ::type = false> +struct ThreadwiseTensorSliceTransfer_v1r3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v1r3(const DstDesc& dst_desc, + const Index& dst_slice_origin_idx, + const ElementwiseOperation& element_op) + : dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin_idx)), + element_op_{element_op} + { + static_assert(SrcDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc need to known at compile-time"); + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void Run(const SrcDesc&, + const SrcSliceOriginIdx&, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + static_assert(SrcDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc need to known at compile-time"); + + static_assert(is_known_at_compile_time>::value, + "wrong! SrcSliceOrigin need to known at compile-time"); + + static_assert(SrcBuffer::IsStaticBuffer(), "wrong! SrcBuffer need to be StaticBuffer"); + + // SrcDesc and src_slice_origin_idx are known at compile-time + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + // TODO: Use SpaceFillingCurve::ScalarsPerAccess instread of DstScalarPerVector? + static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, + "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); + typename vector_type_maker::type dst_vector; + using dst_vector_t = typename vector_type_maker::type::type; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + static_for<0, num_access, 1>{}([&](auto idx_1d) { + constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); + + // copy data from src_buf into dst_vector + // TODO: It's a hack here to use \p dst_scalar_step_in_vector. Use SpaceFillingCurve? 
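+            // Gather DstScalarPerVector scalars for this access: read the compile-time source
+            // buffer at offsets given by the space-filling curve, apply the element-wise op,
+            // convert to DstData, and pack the results into dst_vector for one vectorized store.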
+ static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t src_offset = src_desc.CalculateOffset( + src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); + + SrcData v; + + // apply element-wise operation + element_op_(v, src_buf[Number{}]); + + // apply type convert + dst_vector.template AsType()(i) = type_convert(v); + }); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + // copy data from dst_vector into dst_buf + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + + return reset_step; + } + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + DstCoord dst_coord_; + const ElementwiseOperation element_op_; +}; // namespace ThreadwiseTensorSliceTransfer_v1r3 + +// Assume: +// 1. src: +// 1. SrcDesc is not known at compile-time +// 2. SrcBuffer is DynamicBuffer +// 3. src_slice_origin_idx is not known at compile-time +// 2. dst: +// 1. DstDesc is known at compile-time +// 2. DstBuffer is StaticBuffer +// 3. dst_slice_origin_idx is known at compile-time +template ::type = false> +struct ThreadwiseTensorSliceTransfer_v2 +{ + static_assert((InvalidElementAsNaN && !std::is_integral::value) || + (!InvalidElementAsNaN), + "Filling invalid element as NaN is only for floating point types"); + + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v2(const SrcDesc& src_desc, + const Index& src_slice_origin_idx) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin_idx)) + { + static_assert(DstDesc::IsKnownAtCompileTime(), + "wrong! 
SrcDesc need to known at compile-time"); + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc&, + const DstSliceOriginIdx&, + DstBuffer& dst_buf) + { + static_assert(DstDesc::IsKnownAtCompileTime(), + "wrong! DstDesc need to known at compile-time"); + + static_assert(is_known_at_compile_time>::value, + "wrong! DstSliceOrigin need to known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + "wrong! inconsistent type"); + + // DstDesc and dst_slice_origin_idx are known at compile-time + constexpr auto dst_desc = remove_cvref_t{}; + constexpr auto dst_slice_origin_idx = DstSliceOriginIdx{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + // loop over tensor and copy + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + static_for<0, num_access, 1>{}([&](auto idx_1d) { + typename vector_type_maker::type src_vector; + + using src_vector_t = + typename vector_type_maker::type::type; + constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d); + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf into src_vector + src_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = + dst_desc.CalculateOffset(to_multi_index(dst_slice_origin_idx) + src_data_idx + + i * src_scalar_step_in_vector); + + if constexpr(InvalidElementAsNaN) + { + dst_buf(Number{}) = + is_src_valid + ? 
type_convert(src_vector.template AsType()[i]) + : NumericLimits::QuietNaN(); + } + else + { + dst_buf(Number{}) = + type_convert(src_vector.template AsType()[i]); + } + }); + + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + + move_tensor_coordinate( + src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); + } + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + + return reset_step; + } + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + private: + SrcCoord src_coord_; +}; // namespace ck + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. 
Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) + { + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void + RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_tmp_vector + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_tmp_vector to buffer_ + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector); + + buffer_(Number{}) = src_tmp_vector.template AsType()[i]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + vector_type_maker_t dst_tmp_vector; + + // copy data from buffer_ to dst_tmp_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(dst_data_idx + i * dst_scalar_step_in_vector); + + dst_tmp_vector.template AsType()(i) = + type_convert(buffer_[Number{}]); + }); + + using dst_vector_t = typename decltype(dst_tmp_vector)::type; + + // copy data from dst_tmp_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_tmp_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_step_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_step_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, 
nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
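// Illustrative sketch (hypothetical names, host-side C++, 2D only): the forward_sweep /
// reset-step machinery above is easier to see in scalar form. The snippet below walks a
// 2 x 4 access grid in the same zig-zag order: dimension 0 always sweeps forward, and the
// inner dimension reverses whenever the mixed-radix prefix of the outer indices is odd.
// Negating the index reached on the final iteration reproduces what
// GetSrcCoordinateResetStep()/GetDstCoordinateResetStep() return without replaying the loop.
#include <cstdio>

int main()
{
    constexpr int lengths[2] = {2, 4}; // ordered access lengths
    int idx[2] = {0, 0};               // data index actually visited

    for(int i0 = 0; i0 < lengths[0]; ++i0)
    {
        for(int i1 = 0; i1 < lengths[1]; ++i1)
        {
            const bool forward = (i0 % 2 == 0); // parity of the outer prefix
            idx[0] = i0;
            idx[1] = forward ? i1 : lengths[1] - 1 - i1;
            std::printf("visit (%d, %d)\n", idx[0], idx[1]);
        }
    }

    // the coordinate is left at idx after the last access; a step of
    // (-idx[0], -idx[1]) moves it back to the slice origin
    std::printf("reset step = (%d, %d)\n", -idx[0], -idx[1]);
    return 0;
}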
+ const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto buffer_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + + static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); + + StaticBuffer buffer_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; + +// Assume: +// 1. src: +// 1. SrcDesc is known at compile-time +// 2. SrcBuffer is DynamicBuffer +// 3. src_ref_idx is known at run-time +// 4. SrcRefToOriginDisplacement is known at compile-time +// 5. use #-step +// 2. dst: +// 1. DstDesc is known at compile-time +// 2. DstBuffer is StaticBuffer +// 3. DstOriginIdx is known at compile-time +// 4. use direct address calculation +// 3. vector access on src +template ::type = false> +struct ThreadwiseTensorSliceTransfer_v4 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v4(const Index& src_ref_idx) + : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert(SliceLengths::At(Number{}) % SrcScalarPerVector == 0, + "wrong! Not divisible"); + } + + template + __device__ void Run(const SrcDesc&, + const SrcRefToOriginDisplacement&, + const SrcBuffer& src_buf, + const DstDesc&, + const DstOriginIdx&, + DstBuffer& dst_buf) const + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + static_assert(DstBuffer::IsStaticBuffer(), "wrong! 
DstBuffer need to be StaticBuffer"); + + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); + + // SrcDesc and DstDesc are known at compile-time + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; + + // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time + constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); + constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); + + // scalar per access of each dim + constexpr auto src_scalar_per_access = generate_sequence_v2( + [&](auto i) constexpr { + if constexpr(i == SrcVectorDim) + { + return Number{}; + } + else + { + return Number<1>{}; + } + }, + Number{}); + + // scalar step (if steping on SrcVectorDim) of each dim + constexpr auto src_scalar_step_in_vector = generate_sequence_v2( + [&](auto i) constexpr { + if constexpr(i == SrcVectorDim) + { + return Number<1>{}; + } + else + { + return Number<0>{}; + } + }, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + static_ford{}([&](auto ordered_access_idx) { +#if 0 + // TODO: unable to compile + // position in slice window + constexpr auto data_to_origin_disp_idx = + container_reorder_given_old2new(ordered_access_idx, dim_access_order) * + src_scalar_per_access; +#else + // position in slice window + constexpr auto data_to_origin_disp_idx = + ordered_access_idx.ReorderGivenOld2New(dim_access_order) * src_scalar_per_access; +#endif + // src coordinate + constexpr auto src_ref_to_data_disp_idx = + src_ref_to_origin_disp_idx + data_to_origin_disp_idx; + + constexpr auto src_ref_to_data_disp_coord_step = + make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx); + + auto src_data_coord = src_ref_coord_; + + move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step); + + vector_type_maker_t src_tmp_vector; + + using src_vector_t = typename decltype(src_tmp_vector)::type; + + const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( + src_desc, src_data_coord); + + // copy data from src_buf into src_tmp_vector + if constexpr(SrcBuffer::IsDynamicBuffer()) + { + src_tmp_vector.template AsType()(Number<0>{}) = + src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); + } + else if constexpr(SrcBuffer::IsStaticBuffer()) + { + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t src_offset = src_desc.CalculateOffset( + src_ref_to_origin_disp_idx + data_to_origin_disp_idx + + i * src_scalar_step_in_vector); + + // apply type convert + src_tmp_vector.template AsType()(i) = src_buf[Number{}]; + }); + } + // copy data from src_tmp_vector to dst_tmp_vector (data cast data from SrcData to + // DstData) + vector_type_maker_t dst_tmp_vector; + + // TODO: if SrcData and DstData are vetor type, then static_cast may not compile + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + dst_tmp_vector.template AsType()(i) = + type_convert(src_tmp_vector.template AsType()[i]); + }); + + // copy data from dst_tmp_vector into dst_buf + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + 
i * src_scalar_step_in_vector); + + dst_buf(Number{}) = dst_tmp_vector.template AsType()[i]; + }); + }); + } + + template + __device__ void MoveSrcSliceWindow(const SrcDesc&, + const SrcSliceMoveStepIdx& src_slice_move_step_idx) + { + constexpr auto src_desc = SrcDesc{}; + + const auto src_slice_move_step_iter = + make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx)); + + move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); + } + __device__ void SetSrcCoord(const Index& src_ref_idx) + { + src_ref_coord_ = make_tensor_coordinate(SrcDesc{}, src_ref_idx); + } + + private: + SrcCoord src_ref_coord_; +}; + +// Do NOT involve any tensor coordinates with StaticBuffer +template ::type = false> +struct ThreadwiseTensorSliceTransfer_StaticToStatic +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + __device__ constexpr ThreadwiseTensorSliceTransfer_StaticToStatic( + const ElementwiseOperation& element_op) + : element_op_{element_op} + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! Desc need to known at compile-time"); + + static_assert(SliceLengths::At(Number{}) % DstScalarPerVector == 0, + "wrong! Not divisible"); + } + + template + __device__ void Run(const SrcDesc&, + const SrcSliceOriginIdx&, + const SrcBuffer& src_buf, + const DstDesc&, + const DstSliceOriginIdx&, + DstBuffer& dst_buf) + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! Desc need to known at compile-time"); + + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! SliceOrigin need to known at compile-time"); + + static_assert(SrcBuffer::IsStaticBuffer() && DstBuffer::IsStaticBuffer(), + "wrong! 
Buffer need to be StaticBuffer"); + + // SrcDesc and src_slice_origin_idx are known at compile-time + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; + constexpr auto src_slice_origin_idx = to_multi_index(SrcSliceOriginIdx{}); + constexpr auto dst_slice_origin_idx = to_multi_index(DstSliceOriginIdx{}); + + // scalar per access on each dim + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = + generate_sequence(detail::lambda_scalar_step_in_vector{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + static_assert(DstScalarPerVector == SpaceFillingCurve::ScalarPerVector, + "wrong!DstScalarPerVector != SpaceFillingCurve::ScalarPerVector"); + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + static_for<0, num_access, 1>{}([&](auto idx_1d) { + constexpr auto idx_md = SpaceFillingCurve::GetIndex(idx_1d); + + // copy data from src_buf into dst_vector + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + constexpr index_t src_offset = src_desc.CalculateOffset( + src_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); + + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_slice_origin_idx + idx_md + i * dst_scalar_step_in_vector); + + SrcData v; + + // apply element-wise operation + element_op_(v, src_buf[Number{}]); + + // apply type convert + dst_buf(Number{}) = type_convert(v); + }); + }); + } + + ElementwiseOperation element_op_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp new file mode 100644 index 00000000..bb28c194 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp @@ -0,0 +1,794 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer.hpp" +#include "ck/tensor/static_tensor.hpp" + +namespace ck { + +namespace detail { +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access_for_src_and_dst +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + if(i == SrcVectorDim && i == DstVectorDim) + { + return math::lcm(SrcScalarPerVector, DstScalarPerVector); + } + else if(i == SrcVectorDim) + { + return SrcScalarPerVector; + } + else if(i == DstVectorDim) + { + return DstScalarPerVector; + } + else + { + return 1; + } + } +}; + +} // namespace detail + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. 
Use thread buffer +template +struct ThreadwiseTensorSliceTransfer_v3r1 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + static constexpr auto I0 = Number<0>{}; + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r1( + const SrcDesc& src_desc, + const Index& src_slice_origin, + const SrcElementwiseOperation& src_element_op, + const DstDesc& dst_desc, + const Index& dst_slice_origin, + const DstElementwiseOperation& dst_element_op) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), + src_element_op_(src_element_op), + dst_element_op_(dst_element_op) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void RunRead(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + Number thread_scratch_id = Number{}) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? 
-src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + constexpr auto src_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + using src_vector_type = vector_type_maker_t; + using src_vector_t = typename src_vector_type::type; + + // copy data from src_buf into src_vector_container + auto src_vector_container = src_vector_type{ + src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; + + // apply SrcElementwiseOperation on src_vector_container + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + SrcData src_v; + + src_element_op_(src_v, src_vector_container.template AsType()[i]); + + src_vector_container.template AsType()(i) = src_v; + }); + + // copy data from src_vector_container into src_thread_scratch_ + src_thread_scratch_tuple_(thread_scratch_id) + .template SetAsType( + src_data_idx_seq, src_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move src coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + TransferDataFromSrcThreadScratchToDstThreadScratch(Number thread_scratch_id) + { +#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = + type_convert(src_thread_scratch_tuple_[thread_scratch_id][idx]); + }); +#else + // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_ + // TODO make this logic more generic for more sub-dword datatype + if constexpr(SrcVectorDim != 
DstVectorDim && + ((is_same>::value && + is_same>::value && + SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) || + (is_same>::value && + is_same>::value && + SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0))) + { + // each transpose does + // DstScalarPerVector # of src vectors in src_thread_scratch_ + // SrcScalarPerVector # of dst vectors in dst_thread_scratch_ + constexpr index_t num_src_vector = Number{}; + constexpr index_t num_dst_vector = Number{}; + + // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose + // TODO: make this logic generic for all scenario + static_assert(SrcVectorDim != DstVectorDim, "wrong"); + + constexpr auto src_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access_for_src_and_dst{}, + Number{}); + + constexpr auto access_lengths = SliceLengths{} / scalar_per_access; + + static_ford{}([&](auto access_idx) { + constexpr auto data_idx = access_idx * scalar_per_access; + + constexpr auto data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + // TODO type_convert is not used yet!!!!! + using src_vector_t = vector_type_maker_t; + using dst_vector_t = vector_type_maker_t; + + // get DstScalarPerVector # of read-only references to src vectors from + // src_thread_scratch_ + const auto src_vector_refs = generate_tie( + [&](auto i) -> const src_vector_t& { + // i increment corresponds to movement in DstVectorDim + return src_thread_scratch_tuple_[thread_scratch_id].GetVectorTypeReference( + data_idx_seq + i * dst_scalar_step_in_vector); + }, + Number{}); + + // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_ + auto dst_vector_refs = generate_tie( + [&](auto i) -> dst_vector_t& { + // i increment corresponds to movement in SrcVectorDim + return dst_thread_scratch_.GetVectorTypeReference( + data_idx_seq + i * src_scalar_step_in_vector); + }, + Number{}); + + // do data transpose + // TODO type_convert is not used yet!!!!! + transpose_vectors{}( + src_vector_refs, dst_vector_refs); + }); + } + else + { + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = + type_convert(src_thread_scratch_tuple_[thread_scratch_id][idx]); + }); + } +#endif + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + Number thread_scratch_id = Number{}) + { + // if there is transpose, it's done here + // TODO move this elsewhere + TransferDataFromSrcThreadScratchToDstThreadScratch(thread_scratch_id); + + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer or DstBuffer data type is wrong"); + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + constexpr auto dst_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + using dst_vector_type = vector_type_maker_t; + using dst_vector_t = typename dst_vector_type::type; + + // copy data from dst_thread_scratch_ into dst_vector_container + auto dst_vector_container = dst_vector_type{ + dst_thread_scratch_.template GetAsType(dst_data_idx_seq)}; + + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + DstData dst_v; + + // apply DstElementwiseOperation + dst_element_op_(dst_v, dst_vector_container.template AsType()[i]); + + dst_vector_container.template AsType()(i) = dst_v; + }); + + // copy data from dst_vector_container to dst_buf + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move dst coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? 
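// Illustrative sketch (hypothetical names, host-side C++): MoveSrcSliceWindow()/
// MoveDstSliceWindow() fold the pending reset into the window move when the
// Reset*CoordinateAfterRun flag is false, so a single combined step is issued instead
// of a step back to the origin followed by the requested step. In 1D, with the
// coordinate reduced to a plain integer offset, the adjustment looks like this:
#include <cassert>

int move_slice_window(int coord, int requested_step, int reset_step, bool reset_after_run)
{
    // if the coordinate was already reset after RunRead()/RunWrite(), apply the
    // requested step as-is; otherwise fuse the reset into the same move
    const int adjusted_step = reset_after_run ? requested_step : requested_step + reset_step;
    return coord + adjusted_step;
}

int main()
{
    // the coordinate drifted to offset 7 during the run; its reset step is -7
    assert(move_slice_window(/*coord=*/7, /*requested_step=*/16, /*reset_step=*/-7, false) == 16);
    // with reset-after-run the coordinate is already back at the origin
    assert(move_slice_window(/*coord=*/0, /*requested_step=*/16, /*reset_step=*/-7, true) == 16);
    return 0;
}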
+ const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + __device__ static constexpr auto GetSrcThreadScratchDescriptor() + { + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(src_access_lengths), Number{}); + + // 1st stage of transforms + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[i], + src_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(src_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); + } + + __device__ static constexpr auto GetDstThreadScratchDescriptor() + { + // 1st stage of transforms + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(dst_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[i], + dst_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(dst_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); + } + + private: + static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; + static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; + + using SrcThreadScratch = StaticTensorTupleOfVectorBuffer; + + using DstThreadScratch = StaticTensorTupleOfVectorBuffer; + + StaticallyIndexedArray src_thread_scratch_tuple_; + + DstThreadScratch dst_thread_scratch_; + + SrcCoord src_coord_; + DstCoord dst_coord_; + const SrcElementwiseOperation src_element_op_; + const DstElementwiseOperation dst_element_op_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp new file mode 
100644 index 00000000..6a73466e --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r3.hpp @@ -0,0 +1,886 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP +#define CK_THREADWISE_TENSOR_SLICE_TRANSFER_V3R3_HPP + +#include "common_header.hpp" +#include "tensor_descriptor.hpp" +#include "tensor_descriptor_helper.hpp" +#include "static_tensor.hpp" + +namespace ck { + +namespace detail { +// TODO: How to fix this? It uses an struct instead of lambda because lambda +// doesn't have constructor +template +struct lambda_scalar_per_access_for_src_and_dst +{ + __host__ __device__ constexpr auto operator()(index_t i) const + { + if(i == SrcVectorDim && i == DstVectorDim) + { + return math::lcm(SrcScalarPerVector, DstScalarPerVector); + } + else if(i == SrcVectorDim) + { + return SrcScalarPerVector; + } + else if(i == DstVectorDim) + { + return DstScalarPerVector; + } + else + { + return 1; + } + } +}; + +} // namespace detail + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v3r3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + using Dst0Coord = decltype(make_tensor_coordinate(Dst0Desc{}, Index{})); + using Dst1Coord = decltype(make_tensor_coordinate(Dst1Desc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + using Dst0CoordStep = decltype(make_tensor_coordinate_step(Dst0Desc{}, Index{})); + using Dst1CoordStep = decltype(make_tensor_coordinate_step(Dst1Desc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v3r3( + const SrcDesc& src_desc, + const Index& src_slice_origin, + const SrcElementwiseOperation& src_element_op, + const DstDesc& dst_desc, + const Dst0Desc& dst0_desc, + const Dst1Desc& dst1_desc, + const Index& dst_slice_origin, + const DstElementwiseOperation& dst_element_op) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), + dst0_coord_(make_tensor_coordinate(dst0_desc, dst_slice_origin)), + dst1_coord_(make_tensor_coordinate(dst1_desc, dst_slice_origin)), + src_element_op_(src_element_op), + dst_element_op_(dst_element_op) + { + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, + const Dst0Desc& dst0_desc, + const Dst1Desc& dst1_desc, + const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + dst0_coord_ = make_tensor_coordinate(dst0_desc, dst_slice_origin_idx); + dst1_coord_ = make_tensor_coordinate(dst1_desc, dst_slice_origin_idx); + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const 
SrcBuffer& src_buf) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer and SrcData data type are inconsistent"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(src_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + constexpr auto src_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + using src_vector_type = vector_type_maker_t; + using src_vector_t = typename src_vector_type::type; + + // copy data from src_buf into src_vector_container + auto src_vector_container = src_vector_type{ + src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; + + // apply SrcElementwiseOperation on src_vector_container + static_for<0, SrcScalarPerVector, 1>{}([&](auto i) { + src_vector_container.template AsType()(i) = + src_element_op_(src_vector_container.template AsType()[i]); + }); + + // copy data from src_vector_container into src_thread_scratch_ + src_thread_scratch_.template SetAsType( + src_data_idx_seq, src_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move src coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + __device__ void TransferDataFromSrcThreadScratchToDstThreadScratch() + { +#if !CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert(src_thread_scratch_[idx]); + }); +#else + // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_ + // TODO make this logic more generic for more sub-dword datatype + if constexpr(SrcVectorDim != DstVectorDim && + is_same>::value && + is_same>::value && + SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) + { + // each transpose does + // DstScalarPerVector # of src vectors in src_thread_scratch_ + // SrcScalarPerVector # of dst vectors in dst_thread_scratch_ + constexpr index_t num_src_vector = Number{}; + constexpr index_t num_dst_vector = Number{}; + + // Assume SrcVectorDim is not the same as DstVectorDim, so we do transpose + // TODO: make this logic generic for all scenario + static_assert(SrcVectorDim != DstVectorDim, "wrong"); + + constexpr auto src_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto dst_scalar_step_in_vector = generate_sequence( + detail::lambda_scalar_step_in_vector{}, Number{}); + + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access_for_src_and_dst{}, + Number{}); + + constexpr auto access_lengths = 
SliceLengths{} / scalar_per_access; + + static_ford{}([&](auto access_idx) { + constexpr auto data_idx = access_idx * scalar_per_access; + + constexpr auto data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + // TODO type_convert is not used yet!!!!! + using src_vector_t = vector_type_maker_t; + using dst_vector_t = vector_type_maker_t; + + // get DstScalarPerVector # of read-only references to src vectors from + // src_thread_scratch_ + const auto src_vector_refs = generate_tie( + [&](auto i) -> const src_vector_t& { + // i increment corresponds to movement in DstVectorDim + return src_thread_scratch_.GetVectorTypeReference( + data_idx_seq + i * dst_scalar_step_in_vector); + }, + Number{}); + + // get SrcScalarPerVector # of references to dst vectors from dst_thread_scratch_ + auto dst_vector_refs = generate_tie( + [&](auto i) -> dst_vector_t& { + // i increment corresponds to movement in SrcVectorDim + return dst_thread_scratch_.GetVectorTypeReference( + data_idx_seq + i * src_scalar_step_in_vector); + }, + Number{}); + + // do data transpose + // TODO type_convert is not used yet!!!!! + transpose_vectors{}( + src_vector_refs, dst_vector_refs); + }); + } + else + { + static_ford{}([&](auto idx) { + // convert from SrcData to DstData here + dst_thread_scratch_(idx) = type_convert(src_thread_scratch_[idx]); + }); + } +#endif + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, + DstBuffer& dst_buf, + const Dst0Desc& dst0_desc, + const Dst0Buffer& dst0_buf, + const Dst1Desc& dst1_desc, + const Dst1Buffer& dst1_buf) + { + // if there is transpose, it's done here + // TODO move this elsewhere + TransferDataFromSrcThreadScratchToDstThreadScratch(); + + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + // src scalar per access on each dim + // TODO: don't use this + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, forward_step_idx); + }, + Number{}); + + // make forward steps: dst0 + // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same + // DstScalarPerVector + // TODO: fix this + const auto dst0_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? 
dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst0_desc, forward_step_idx); + }, + Number{}); + + // make forward steps: dst1 + // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same + // DstScalarPerVector + // TODO: fix this + const auto dst1_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst1_desc, forward_step_idx); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst_desc, backward_step_idx); + }, + Number{}); + + // make backward steps: dst0 + // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same + // DstScalarPerVector + // TODO: fix this + const auto dst0_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst0_desc, backward_step_idx); + }, + Number{}); + + // make backward steps: dst1 + // WARNING!!!!!!: this logic is only correct if dst/dst0/dst1 can use the same + // DstScalarPerVector + // TODO: fix this + const auto dst1_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0; + }); + + return make_tensor_coordinate_step(dst1_desc, backward_step_idx); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + constexpr auto dst_data_idx_seq = generate_sequence_v2( + [&](auto i) { return Number{}; }, Number{}); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + using dst_vector_type = vector_type_maker_t; + using dst_vector_t = typename dst_vector_type::type; + + // copy data from dst_thread_scratch_ into dst_vector_container + auto dst_vector_container = dst_vector_type{ + dst_thread_scratch_.template GetAsType(dst_data_idx_seq)}; + + // apply DstElementwiseOperation on dst_vector_container + static_for<0, DstScalarPerVector, 1>{}([&](auto i) { + dst_vector_container.template AsType()(i) = + dst_element_op_(dst_vector_container.template AsType()[i]); + }); + + // copy data from dst_vector_container to dst_buf + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move dst coord + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + // TODO: BUG: should start at 1 + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_scalar_per_access; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto I0 = Number<0>{}; + + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<1, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_scalar_per_access; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? 
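// Illustrative sketch (hypothetical names, host-side C++): the move_on_dim predicate used
// by the RunRead()/RunWrite() loops above is a mixed-radix carry test -- dimension i
// advances only if it still has room and every faster-varying dimension has just hit its
// last position. The same test for a plain index array:
#include <array>
#include <cstddef>
#include <cstdio>

template <std::size_t N>
std::array<bool, N> move_on_dim(const std::array<int, N>& idx, const std::array<int, N>& lengths)
{
    std::array<bool, N> move{};
    for(std::size_t i = 0; i < N; ++i)
    {
        move[i] = idx[i] < lengths[i] - 1;
        for(std::size_t j = i + 1; j < N; ++j)
        {
            move[i] = move[i] && (idx[j] == lengths[j] - 1);
        }
    }
    return move;
}

int main()
{
    // with access lengths {2, 4}, index {0, 3} carries into dim 0 but not dim 1
    const auto m = move_on_dim<2>(std::array<int, 2>{0, 3}, std::array<int, 2>{2, 4});
    std::printf("move dim0=%d dim1=%d\n", int(m[0]), int(m[1])); // prints 1 0
    return 0;
}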
+ const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Dst0Desc dst0_desc, + const Dst1Desc dst1_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + move_tensor_coordinate(dst0_desc, dst0_coord_, adjusted_step); + move_tensor_coordinate(dst1_desc, dst1_coord_, adjusted_step); + } + + __device__ static constexpr auto GetSrcThreadScratchDescriptor() + { + constexpr auto src_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access; + + constexpr auto src_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(src_access_lengths), Number{}); + + // 1st stage of transforms + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(src_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(src_access_lengths_and_vector_length[i], + src_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(src_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == SrcVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, transforms, low_dim_idss, up_dim_idss); + } + + __device__ static constexpr auto GetDstThreadScratchDescriptor() + { + // 1st stage of transforms + constexpr auto dst_scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access; + + constexpr auto dst_access_lengths_and_vector_length = container_push_back( + sequence_to_tuple_of_number(dst_access_lengths), Number{}); + + constexpr auto desc0 = + make_naive_tensor_descriptor_packed(dst_access_lengths_and_vector_length); + + // 2nd stage of transforms + constexpr auto transforms = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return make_merge_transform_v3_division_mod( + make_tuple(dst_access_lengths_and_vector_length[i], + dst_access_lengths_and_vector_length[Number{}])); + } + else + { + return make_pass_through_transform(dst_access_lengths_and_vector_length[i]); + } + }, + Number{}); + + constexpr auto low_dim_idss = generate_tuple( + [&](auto i) { + if constexpr(i == DstVectorDim) + { + return Sequence{}; + } + else + { + return Sequence{}; + } + }, + Number{}); + + constexpr auto up_dim_idss = + generate_tuple([&](auto i) { return Sequence{}; }, Number{}); + + return transform_tensor_descriptor(desc0, 
transforms, low_dim_idss, up_dim_idss); + } + + private: + static constexpr auto src_thread_scratch_desc_ = decltype(GetSrcThreadScratchDescriptor()){}; + static constexpr auto dst_thread_scratch_desc_ = decltype(GetDstThreadScratchDescriptor()){}; + + StaticTensorTupleOfVectorBuffer + src_thread_scratch_; + + StaticTensorTupleOfVectorBuffer + dst_thread_scratch_; + + SrcCoord src_coord_; + DstCoord dst_coord_; + const SrcElementwiseOperation src_element_op_; + const DstElementwiseOperation dst_element_op_; +}; + +} // namespace ck +#endif diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp new file mode 100644 index 00000000..6e8a2393 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v4r1.hpp @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" + +namespace ck { +// Assume: +// 1. src: +// 1. SrcDesc is known at compile-time +// 2. SrcBuffer is DynamicBuffer +// 3. src_ref_idx is known at run-time +// 4. SrcRefToOriginDisplacement is known at compile-time +// 5. use #-step +// 2. dst: +// 1. DstDesc is known at compile-time +// 2. DstBuffer is StaticBuffer +// 3. DstOriginIdx is known at compile-time +// 4. use direct address calculation +// 3. vector access on src +template ::type = false> +struct ThreadwiseTensorSliceTransfer_v4r1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v4r1(const Index& src_ref_idx) + : src_ref_coord_(make_tensor_coordinate(SrcDesc{}, src_ref_idx)) + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_for<0, nDim, 1>{}([](auto i) { + static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0, "wrong!"); + }); + } + + template + __device__ void Run(const SrcDesc&, + const SrcRefToOriginDisplacement&, + const SrcBuffer& src_buf, + const DstDesc&, + const DstOriginIdx&, + DstBuffer& dst_buf) const + { + static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(), + "wrong! SrcDesc and DstDesc need to known at compile-time"); + + static_assert( + is_same, remove_cvref_t>::value && + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + static_assert(DstBuffer::IsStaticBuffer(), "wrong! DstBuffer need to be StaticBuffer"); + + static_assert(is_known_at_compile_time>::value && + is_known_at_compile_time>::value, + "wrong! 
SrcOriginToRefDistance and DstOriginToRefDistance need to be known " + "at compile-time"); + + // SrcDesc and DstDesc are known at compile-time + constexpr auto src_desc = remove_cvref_t{}; + constexpr auto dst_desc = remove_cvref_t{}; + + // SrcOriginToRefDisttance and DstOriginToRefDistance are known at compile-time + constexpr auto src_ref_to_origin_disp_idx = to_multi_index(SrcRefToOriginDisplacement{}); + constexpr auto dst_origin_idx = to_multi_index(DstOriginIdx{}); + + // tensor descriptor for src_vector + constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; + + constexpr auto src_vector_tensor_strides = container_reorder_given_old2new( + container_reverse_exclusive_scan( + container_reorder_given_new2old(src_vector_tensor_lengths, + SrcVectorTensorContiguousDimOrder{}), + math::multiplies{}, + I1), + SrcVectorTensorContiguousDimOrder{}); + + constexpr auto src_vector_desc = + make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths), + sequence_to_tuple_of_number(src_vector_tensor_strides)); + + // access order and lengths + constexpr auto access_lengths = SliceLengths{} / src_vector_tensor_lengths; + + constexpr auto dim_access_order = DimAccessOrder{}; + + constexpr auto ordered_access_lengths = + container_reorder_given_new2old(access_lengths, dim_access_order); + + static_ford{}([&](auto ordered_access_idx) { + // position in slice window + constexpr auto data_to_origin_disp_idx = + ordered_access_idx.ReorderGivenOld2New(dim_access_order) * + src_vector_tensor_lengths; + + // src coordinate at starting point of src_vector + constexpr auto src_ref_to_data_disp_idx = + src_ref_to_origin_disp_idx + data_to_origin_disp_idx; + + constexpr auto src_ref_to_data_disp_coord_step = + make_tensor_coordinate_step(src_desc, src_ref_to_data_disp_idx); + + auto src_data_coord = src_ref_coord_; + + move_tensor_coordinate(src_desc, src_data_coord, src_ref_to_data_disp_coord_step); + + vector_type_maker_t src_vector; + + using src_vector_t = typename decltype(src_vector)::type; + + const bool is_src_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid( + src_desc, src_data_coord); + + // copy data from src_buf into src_vector + src_vector.template AsType()(I0) = + src_buf.template Get(src_data_coord.GetOffset(), is_src_valid); + + // copy data from src_vector into dst_buf (also cast from SrcData to DstData) + static_ford{}([&](auto src_vector_idx_) { + constexpr auto src_vector_idx = to_multi_index(src_vector_idx_); + + constexpr index_t src_vector_offset = + src_vector_desc.CalculateOffset(src_vector_idx); + + constexpr index_t dst_offset = dst_desc.CalculateOffset( + dst_origin_idx + data_to_origin_disp_idx + src_vector_idx); + + dst_buf(Number{}) = type_convert( + src_vector.template AsType()[Number{}]); + }); + }); + } + + template + __device__ void MoveSrcSliceWindow(const SrcDesc&, + const SrcSliceMoveStepIdx& src_slice_move_step_idx) + { + constexpr auto src_desc = SrcDesc{}; + + const auto src_slice_move_step_iter = + make_tensor_coordinate_step(src_desc, to_multi_index(src_slice_move_step_idx)); + + move_tensor_coordinate(SrcDesc{}, src_ref_coord_, src_slice_move_step_iter); + } + + private: + SrcCoord src_ref_coord_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp new file mode 100644 index 00000000..f13da341 --- /dev/null +++ 
b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v5r1.hpp @@ -0,0 +1,614 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" + +namespace ck { + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +// 4. Use thread buffer +template // control whether to move back dst coordinate after each + // RunWrite(), will be fused with MoveDstSliceWindow to + // save addr computation +struct ThreadwiseTensorSliceTransfer_v5r1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + static constexpr index_t nDim = SliceLengths::Size(); + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{})); + using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{})); + + __device__ constexpr ThreadwiseTensorSliceTransfer_v5r1(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)) + { + // TODO: fix this + static_assert(is_same::value, + "wrong! current implementation assume SrcData and DstData are same type"); + + static_for<0, nDim, 1>{}([](auto i) { + static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0 && + SliceLengths::At(i) % DstVectorTensorLengths::At(i) == 0, + "wrong!"); + }); + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void + RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks) + { + static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! 
SrcBuffer and SrcData data type are inconsistent"); + + // tensor descriptor for src_vector + constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; + + constexpr auto src_vector_tensor_strides = container_reorder_given_old2new( + container_reverse_exclusive_scan( + container_reorder_given_new2old(src_vector_tensor_lengths, + SrcVectorTensorContiguousDimOrder{}), + math::multiplies{}, + I1), + SrcVectorTensorContiguousDimOrder{}); + + constexpr auto src_vector_desc = + make_naive_tensor_descriptor(sequence_to_tuple_of_number(src_vector_tensor_lengths), + sequence_to_tuple_of_number(src_vector_tensor_strides)); + + // access order and lengths + constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // make forward steps + const auto src_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, forward_step_idx, src_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto src_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0; + }); + + return make_tensor_coordinate_step( + src_desc, backward_step_idx, src_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_src_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_src_access_idx[i] + : ordered_src_access_lengths[i] - 1 - + ordered_src_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_vector_tensor_lengths; + }(); + + vector_type_maker_t src_vector; + + using src_vector_t = typename decltype(src_vector)::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf to src_vector + src_vector.template AsType()(I0) = + src_buf.template Get(src_coord_.GetOffset(), is_src_valid); + + // copy data from src_vector to buffer_ + static_ford{}([&](auto src_vector_idx_) { + constexpr auto src_vector_idx = to_multi_index(src_vector_idx_); + + constexpr index_t src_vector_offset = + src_vector_desc.CalculateOffset(src_vector_idx); + + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(src_data_idx + src_vector_idx); + + buffer_(Number{}) = + src_vector.template AsType()[Number{}]; + }); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]); + } + } + }); + }); + + // move src coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetSrcCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + } + + template + __device__ void + RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks) + { + static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or + DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds, + "wrong!"); + + static_assert( + is_same, remove_cvref_t>::value, + "wrong! SrcBuffer or DstBuffer data type is wrong"); + + // tensor descriptor for dst_vector + constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; + + constexpr auto dst_vector_tensor_strides = container_reorder_given_old2new( + container_reverse_exclusive_scan( + container_reorder_given_new2old(dst_vector_tensor_lengths, + DstVectorTensorContiguousDimOrder{}), + math::multiplies{}, + I1), + DstVectorTensorContiguousDimOrder{}); + + constexpr auto dst_vector_desc = + make_naive_tensor_descriptor(sequence_to_tuple_of_number(dst_vector_tensor_lengths), + sequence_to_tuple_of_number(dst_vector_tensor_strides)); + + // dst access order and lengths + constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // make forward steps + const auto dst_forward_steps = generate_tuple( + [&](auto i) { + Index forward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + forward_step_idx(j) = (i.value == j.value) ? 
dst_vector_tensor_lengths[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, forward_step_idx, dst_step_hacks[I0][i]); + }, + Number{}); + + // make backward steps + const auto dst_backward_steps = generate_tuple( + [&](auto i) { + Index backward_step_idx; + + static_for<0, nDim, 1>{}([&](auto j) { + backward_step_idx(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0; + }); + + return make_tensor_coordinate_step( + dst_desc, backward_step_idx, dst_step_hacks[I1][i]); + }, + Number{}); + + // loop over tensor and copy + static_ford{}([&](auto ordered_dst_access_idx) { + // judge move forward or move backward + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_idx[I0]; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j]; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i] + : ordered_dst_access_lengths[i] - 1 - + ordered_dst_access_idx[i]; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_vector_tensor_lengths; + }(); + + vector_type_maker_t dst_vector; + + // copy data from buffer_ to dst_vector (also cast from SrcData to DstData) + static_ford{}([&](auto dst_vector_idx_) { + constexpr auto dst_vector_idx = to_multi_index(dst_vector_idx_); + + constexpr index_t buffer_offset = + buffer_desc_.CalculateOffset(dst_data_idx + dst_vector_idx); + + constexpr index_t dst_vector_offset = + dst_vector_desc.CalculateOffset(dst_vector_idx); + + dst_vector.template AsType()(Number{}) = + type_convert(buffer_[Number{}]); + }); + + using dst_vector_t = typename decltype(dst_vector)::type; + + // copy data from dst_vector to dst_buf + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Set( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector.template AsType()[Number<0>{}]); + + constexpr auto move_on_dim = [&]() constexpr + { + StaticallyIndexedArray move_on_dim_; + + static_for<0, nDim, 1>{}([&](auto i) { + move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1; + + static_for{}([&](auto j) { + move_on_dim_(i) &= + ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1; + }); + }); + + return move_on_dim_; + } + (); + + // move + static_for<0, nDim, 1>{}([&](auto i) { + if constexpr(move_on_dim[i]) + { + if constexpr(forward_sweep[i]) + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]); + } + else + { + move_tensor_coordinate( + dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]); + } + } + }); + }); + + // move dst coordinate back to slice origin (or not) + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetDstCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + template + __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf) + { + constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto src_step_hacks = + 
make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunRead(src_desc, src_buf, src_step_hacks); + } + + template + __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf) + { + constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform(); + + constexpr auto zeros = typename uniform_sequence_gen::type{}; + + constexpr auto dst_step_hacks = + make_tuple(generate_tuple([&](auto) { return zeros; }, Number{}), + generate_tuple([&](auto) { return zeros; }, Number{})); + + RunWrite(dst_desc, dst_buf, dst_step_hacks); + } + + __device__ static constexpr auto GetSrcCoordinateResetStep() + { + constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{}; + + constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths; + + constexpr auto src_dim_access_order = SrcDimAccessOrder{}; + + constexpr auto ordered_src_access_lengths = + container_reorder_given_new2old(src_access_lengths, src_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_src_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate src data index after last iteration in RunRead(), if it has not being reset by + // RunRead() + constexpr auto src_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, src_dim_access_order) * + src_vector_tensor_lengths; + }(); + + // + constexpr auto reset_src_data_step = [&]() { + Index reset_src_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; }); + + return reset_src_data_step_; + }(); + + return reset_src_data_step; + } + + __device__ static constexpr auto GetDstCoordinateResetStep() + { + constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{}; + + constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths; + + constexpr auto dst_dim_access_order = DstDimAccessOrder{}; + + constexpr auto ordered_dst_access_lengths = + container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order); + + // judge move forward or move backward during the last iteration + constexpr auto forward_sweep = [&]() { + StaticallyIndexedArray forward_sweep_; + + forward_sweep_(I0) = true; + + static_for<1, nDim, 1>{}([&](auto i) { + index_t tmp = ordered_dst_access_lengths[I0] - 1; + + static_for<0, i, 1>{}([&](auto j) { + tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1; + }); + + forward_sweep_(i) = tmp % 2 == 0; + }); + + return forward_sweep_; + }(); + + // calculate dst data index after last iteration in RunWrite(), if it has not being reset by + // RunWrite() + constexpr auto dst_data_idx = [&]() { + Index ordered_idx; + + static_for<0, nDim, 1>{}([&](auto i) { + ordered_idx(i) = forward_sweep[i] ? 
ordered_dst_access_lengths[i] - 1 : 0; + }); + + return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) * + dst_vector_tensor_lengths; + }(); + + // + constexpr auto reset_dst_data_step = [&]() { + Index reset_dst_data_step_; + + static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; }); + + return reset_dst_data_step_; + }(); + + return reset_dst_data_step; + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void + MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx, + const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = + SrcResetCoordinateAfterRun ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetSrcCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step( + src_desc, adjusted_step_idx, src_move_slice_window_step_hack); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by RunWrite(), then need to adjust the step here + const auto adjusted_step_idx = + DstResetCoordinateAfterRun ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetDstCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + static constexpr auto buffer_desc_ = + make_naive_tensor_descriptor_packed(sequence_to_tuple_of_number(SliceLengths{})); + + static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); + + StaticBuffer buffer_; + + SrcCoord src_coord_; + DstCoord dst_coord_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp new file mode 100644 index 00000000..9c91cd9c --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1.hpp @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
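A minimal, illustrative sketch (not taken from this patch) of how a two-phase transfer such as ThreadwiseTensorSliceTransfer_v5r1 above is typically driven from a kernel loop. copy_and_advance and its template parameters are hypothetical; only member functions shown above (RunRead, RunWrite, MoveSrcSliceWindow) are assumed.

template <typename Transfer, typename SrcDesc, typename SrcBuf, typename DstDesc, typename DstBuf>
__device__ void copy_and_advance(Transfer& transfer,
                                 const SrcDesc& src_desc, const SrcBuf& src_buf,
                                 const DstDesc& dst_desc, DstBuf& dst_buf,
                                 const typename Transfer::Index& src_window_step)
{
    transfer.RunRead(src_desc, src_buf);   // gather one slice into the thread-private buffer_
    transfer.RunWrite(dst_desc, dst_buf);  // scatter the thread-private buffer_ to the destination
    transfer.MoveSrcSliceWindow(src_desc, src_window_step); // advance to the next slice
}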
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" + +namespace ck { + +// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory +// and sometimes useless instructions: +// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument +// instead +// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same +// tensor coordinate instead +// 3. Don't use a pointer to VGPR buffer, use vector instead + +// Assume: +// 1. src_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +template +struct ThreadwiseTensorSliceTransfer_v6r1 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + static constexpr auto I0 = Number<0>{}; + + __device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc, + const Index& src_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin, + const ElementwiseOperation& element_op) + : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), + element_op_(element_op) + { + static_assert(SliceLengths::At(Number{}) % ScalarPerVector == 0, + "wrong! cannot evenly divide"); + } + + __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx) + { + src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void Run(const SrcDesc& src_desc, + const SrcBuffer& src_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + // loop over space-filling curve + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + static_for<0, num_access, 1>{}([&](auto idx_1d) { + using src_vector_type = vector_type_maker_t; + using src_vector_t = typename src_vector_type::type; + + using dst_vector_type = vector_type_maker_t; + using dst_vector_t = typename dst_vector_type::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_); + + // copy data from src_buf into src_vector_container + auto src_vector_container = src_vector_type{ + src_buf.template Get(src_coord_.GetOffset(), is_src_valid)}; + + auto dst_vector_container = dst_vector_type{}; + + // apply pointwise operation + static_for<0, ScalarPerVector, 1>{}([&](auto i) { + SrcData v; + + // apply element-wise operation + element_op_(v, src_vector_container.template AsType()[i]); + + // apply type convert + dst_vector_container.template AsType()(i) = type_convert(v); + }); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + // 
copy data from dst_vector into dst_buf + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + + // move coordinate + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( + src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step)); + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } + }); + + // move coordinate back to slice origin (or not) + if constexpr(SrcResetCoordinateAfterRun) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(src_desc, src_coord_, src_reset_step); + } + + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetCoordinateResetStep() + { + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + + return reset_step; + } + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = SrcResetCoordinateAfterRun + ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx); + + move_tensor_coordinate(src_desc, src_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = DstResetCoordinateAfterRun + ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + SrcCoord src_coord_; + DstCoord dst_coord_; + const ElementwiseOperation element_op_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp new file mode 100644 index 00000000..68bc2726 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
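For contrast with the two-phase v5r1 interface, a hypothetical sketch of driving the single-pass ThreadwiseTensorSliceTransfer_v6r1 above, which fuses the read, the element-wise operation, the type conversion, and the write into one Run() call. apply_and_advance is illustrative only and uses just the member functions shown above.

template <typename Transfer, typename SrcDesc, typename SrcBuf, typename DstDesc, typename DstBuf>
__device__ void apply_and_advance(Transfer& transfer,
                                  const SrcDesc& src_desc, const SrcBuf& src_buf,
                                  const DstDesc& dst_desc, DstBuf& dst_buf,
                                  const typename Transfer::Index& window_step)
{
    transfer.Run(src_desc, src_buf, dst_desc, dst_buf); // read, apply element op, convert, write
    transfer.MoveSrcSliceWindow(src_desc, window_step); // advance the source slice window
    transfer.MoveDstSliceWindow(dst_desc, window_step); // advance the destination slice window
}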
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" + +namespace ck { + +// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory +// and sometimes useless instructions: +// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument +// instead +// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same +// tensor coordinate instead +// 3. Don't use a pointer to VGPR buffer, use vector instead + +// Assume: +// 1. src0_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +template +struct ThreadwiseTensorSliceTransfer_v6r2 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{})); + using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + static constexpr auto I0 = Number<0>{}; + + __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc, + const Index& src0_slice_origin, + const Src1Desc& src1_desc, + const Index& src1_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin, + const ElementwiseOperation& element_op) + : src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)), + src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), + element_op_(element_op) + { + static_assert(SliceLengths::At(Number{}) % ScalarPerVector == 0, + "wrong! 
cannot evenly divide"); + } + + __device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc, + const Index& src0_slice_origin_idx) + { + src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx); + } + + __device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc, + const Index& src1_slice_origin_idx) + { + src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void Run(const Src0Desc& src0_desc, + const Src0Buffer& src0_buf, + const Src1Desc& src1_desc, + const Src1Buffer& src1_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + // loop over space-filling curve + static_for<0, num_access, 1>{}([&](auto idx_1d) { + using src0_vector_type = vector_type_maker_t; + using src0_vector_t = typename src0_vector_type::type; + + using src1_vector_type = vector_type_maker_t; + using src1_vector_t = typename src1_vector_type::type; + + using dst_vector_type = vector_type_maker_t; + using dst_vector_t = typename dst_vector_type::type; + + const bool is_src0_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src0_desc, src0_coord_); + + const bool is_src1_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src1_desc, src1_coord_); + + // copy data from src0_buf into src0_vector_container + auto src0_vector_container = src0_vector_type{ + src0_buf.template Get(src0_coord_.GetOffset(), is_src0_valid)}; + + auto src1_vector_container = src1_vector_type{ + src1_buf.template Get(src1_coord_.GetOffset(), is_src1_valid)}; + + auto dst_vector_container = dst_vector_type{}; + + // apply pointwise operation + static_for<0, ScalarPerVector, 1>{}([&](auto i) { + element_op_(dst_vector_container.template AsType()(i), + src0_vector_container.template AsType()[i], + src1_vector_container.template AsType()[i]); + }); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + // copy data from dst_vector into dst_buf + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + + // move coordinate + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( + src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step)); + move_tensor_coordinate( + src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step)); + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } + }); + + // move coordinate back to slice origin (or not) + if constexpr(Src0ResetCoordinateAfterRun) + { + const auto src0_reset_step = + make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step); + } + + if constexpr(Src1ResetCoordinateAfterRun) + { + const auto src1_reset_step = + make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step); 
+ } + + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetCoordinateResetStep() + { + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + + return reset_step; + } + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc, + const Index& src0_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = Src0ResetCoordinateAfterRun + ? src0_slice_origin_step_idx + : src0_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx); + + move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc, + const Index& src1_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = Src1ResetCoordinateAfterRun + ? src1_slice_origin_step_idx + : src1_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx); + + move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = DstResetCoordinateAfterRun + ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + Src0Coord src0_coord_; + Src1Coord src1_coord_; + DstCoord dst_coord_; + const ElementwiseOperation element_op_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp new file mode 100644 index 00000000..0f5fb88b --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
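The element-wise operation consumed by the binary v6r2 transfer above is any callable of the form op(dst, src0, src1), invoked once per scalar lane. A hypothetical functor matching that calling convention, shown only to make the signature concrete:

struct AddExample // illustrative only, not the library's Add functor
{
    template <typename D, typename S0, typename S1>
    __host__ __device__ void operator()(D& dst, const S0& src0, const S1& src1) const
    {
        // combine one element from each source, converting to the destination type
        dst = static_cast<D>(src0 + src1);
    }
};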
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" + +namespace ck { + +// Do following things to avoid "alloca" in LLVM-IR, which would cause scratch memory +// and sometimes useless instructions: +// 1. Don't save a reference to tensor descriptor in class, pass in tensor descriptor as argument +// instead +// 2. Don't construct a new tensor coordinate everytime when using it, update and reuse the same +// tensor coordinate instead +// 3. Don't use a pointer to VGPR buffer, use vector instead + +// Assume: +// 1. src0_desc and dst_desc are not known at compile-time +// 2. SrcBuffer and DstBuffer are DynamicBuffer +// 3. src_slice_origin and dst_slice_origin are not known at compile-time, +template +struct ThreadwiseTensorSliceTransfer_v6r3 +{ + static constexpr index_t nDim = SliceLengths::Size(); + + using Index = MultiIndex; + + using Src0Coord = decltype(make_tensor_coordinate(Src0Desc{}, Index{})); + using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{})); + using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{})); + using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{})); + + static constexpr auto I0 = Number<0>{}; + + __device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc, + const Index& src0_slice_origin, + const Src1Desc& src1_desc, + const Index& src1_slice_origin, + const Src2Desc& src2_desc, + const Index& src2_slice_origin, + const DstDesc& dst_desc, + const Index& dst_slice_origin, + const ElementwiseOperation& element_op) + : src0_coord_(make_tensor_coordinate(src0_desc, src0_slice_origin)), + src1_coord_(make_tensor_coordinate(src1_desc, src1_slice_origin)), + src2_coord_(make_tensor_coordinate(src2_desc, src2_slice_origin)), + dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin)), + element_op_(element_op) + { + static_assert(SliceLengths::At(Number{}) % ScalarPerVector == 0, + "wrong! 
cannot evenly divide"); + } + + __device__ void SetSrc0SliceOrigin(const Src0Desc& src0_desc, + const Index& src0_slice_origin_idx) + { + src0_coord_ = make_tensor_coordinate(src0_desc, src0_slice_origin_idx); + } + + __device__ void SetSrc1SliceOrigin(const Src1Desc& src1_desc, + const Index& src1_slice_origin_idx) + { + src1_coord_ = make_tensor_coordinate(src1_desc, src1_slice_origin_idx); + } + + __device__ void SetSrc2SliceOrigin(const Src2Desc& src2_desc, + const Index& src2_slice_origin_idx) + { + src2_coord_ = make_tensor_coordinate(src2_desc, src2_slice_origin_idx); + } + + __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx) + { + dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx); + } + + template + __device__ void Run(const Src0Desc& src0_desc, + const Src0Buffer& src0_buf, + const Src1Desc& src1_desc, + const Src1Buffer& src1_buf, + const Src2Desc& src2_desc, + const Src2Buffer& src2_buf, + const DstDesc& dst_desc, + DstBuffer& dst_buf) + { + // scalar per access on each dim + // TODO: don't use lambda_scalar_per_access + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + // loop over space-filling curve + static_for<0, num_access, 1>{}([&](auto idx_1d) { + using src0_vector_type = vector_type_maker_t; + using src0_vector_t = typename src0_vector_type::type; + + using src1_vector_type = vector_type_maker_t; + using src1_vector_t = typename src1_vector_type::type; + + using src2_vector_type = vector_type_maker_t; + using src2_vector_t = typename src2_vector_type::type; + + using dst_vector_type = vector_type_maker_t; + using dst_vector_t = typename dst_vector_type::type; + + const bool is_src0_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src0_desc, src0_coord_); + + const bool is_src1_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src1_desc, src1_coord_); + + const bool is_src2_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src2_desc, src2_coord_); + + // copy data from src0_buf into src0_vector_container + auto src0_vector_container = src0_vector_type{ + src0_buf.template Get(src0_coord_.GetOffset(), is_src0_valid)}; + + auto src1_vector_container = src1_vector_type{ + src1_buf.template Get(src1_coord_.GetOffset(), is_src1_valid)}; + + auto src2_vector_container = src2_vector_type{ + src2_buf.template Get(src2_coord_.GetOffset(), is_src2_valid)}; + + auto dst_vector_container = dst_vector_type{}; + + // apply pointwise operation + static_for<0, ScalarPerVector, 1>{}([&](auto i) { + element_op_(dst_vector_container.template AsType()(i), + src0_vector_container.template AsType()[i], + src1_vector_container.template AsType()[i], + src2_vector_container.template AsType()[i]); + }); + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_); + + dst_buf.template Update( + dst_coord_.GetOffset(), + is_dst_valid, + dst_vector_container.template AsType()[I0]); + + // move coordinate + if constexpr(idx_1d.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d); + move_tensor_coordinate( + src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step)); + move_tensor_coordinate( + src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step)); + 
move_tensor_coordinate( + src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step)); + move_tensor_coordinate( + dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step)); + } + }); + + // move coordinate back to slice origin (or not) + if constexpr(Src0ResetCoordinateAfterRun) + { + const auto src0_reset_step = + make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step); + } + + if constexpr(Src1ResetCoordinateAfterRun) + { + const auto src1_reset_step = + make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step); + } + + if constexpr(Src2ResetCoordinateAfterRun) + { + const auto src2_reset_step = + make_tensor_coordinate_step(src2_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(src2_desc, src2_coord_, src2_reset_step); + } + + if constexpr(DstResetCoordinateAfterRun) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep()); + + move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step); + } + } + + __device__ static constexpr auto GetCoordinateResetStep() + { + constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = SpaceFillingCurve>; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + + return reset_step; + } + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrc0SliceWindow(const Src0Desc& src0_desc, + const Index& src0_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = Src0ResetCoordinateAfterRun + ? src0_slice_origin_step_idx + : src0_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx); + + move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrc1SliceWindow(const Src1Desc& src1_desc, + const Index& src1_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = Src1ResetCoordinateAfterRun + ? src1_slice_origin_step_idx + : src1_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx); + + move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step); + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveSrc2SliceWindow(const Src2Desc& src2_desc, + const Index& src2_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = Src2ResetCoordinateAfterRun + ? src2_slice_origin_step_idx + : src2_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? 
+ const auto adjusted_step = make_tensor_coordinate_step(src2_desc, adjusted_step_idx); + + move_tensor_coordinate(src2_desc, src2_coord_, adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + __device__ void MoveDstSliceWindow(const DstDesc& dst_desc, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = DstResetCoordinateAfterRun + ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx); + + move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step); + } + + private: + Src0Coord src0_coord_; + Src1Coord src1_coord_; + Src2Coord src2_coord_; + DstCoord dst_coord_; + const ElementwiseOperation element_op_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp new file mode 100644 index 00000000..2eb1b0ee --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7.hpp @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_description/tensor_space_filling_curve.hpp" + +namespace ck { + +// Thread-level multi-source, multi-destination tensor slice data movement +// Assume: +// 1. All sources and destinations are DynamicBuffer +// 2. Same VectorDim and ScalerPerVector for all sources and destinations +// 3. DstInMemOps are per destination tensor +// 4. ThreadTransferSrcResetCoordinateAfterRunFlags are per source tensor +// 5. ThreadTransferDstResetCoordinateAfterRunFlags are per destination tensor +// 6. Does not need to know src_descs and dst_descs at compile-time +// 7. Does not need to know src_slice_origins and dst_slice_origins at compile-time, +// +// Does following things to avoid scratch memory issue +// 1. Use StaticallyIndexedArray or vector_type instead of C array for thread buffer +// 2. Pass tensor descritpors by reference (or tuple of references) +// 3. Does not keep reference to tensor descriptor +// 4. 
Does not construct new tensor coordinate when call Run() +template + typename SliceLengths, + typename DimAccessOrder, + index_t VectorDim, + index_t ScalarPerVector, + typename SrcResetCoordinateAfterRunFlags, // Sequence + typename DstResetCoordinateAfterRunFlags> // Sequence +struct ThreadwiseTensorSliceTransfer_v7 +{ + static constexpr auto I0 = Number<0>{}; + + static constexpr index_t nDim = SliceLengths::Size(); + + static constexpr index_t nSrc = SrcDescs::Size(); + static constexpr index_t nDst = DstDescs::Size(); + + using Index = MultiIndex; + + // return a tuple of coordiantes for a tuple of tensor + template = false> + static constexpr auto MakeCoordinates(const Descs& descs, const Indices& indices) + { + return generate_tuple([&](auto i) { return make_tensor_coordinate(descs[i], indices[i]); }, + Number{}); + } + + using SrcCoords = decltype(MakeCoordinates(SrcDescs{}, StaticallyIndexedArray{})); + using DstCoords = decltype(MakeCoordinates(DstDescs{}, StaticallyIndexedArray{})); + + // scalar per access on each dim + // FIXME: don't use lambda_scalar_per_access + static constexpr auto scalar_per_access = generate_sequence( + detail::lambda_scalar_per_access{}, Number{}); + + using SpaceFillingCurve = + SpaceFillingCurve>; + + __device__ constexpr ThreadwiseTensorSliceTransfer_v7( + const SrcDescs& src_descs, + const StaticallyIndexedArray& src_slice_origins, + const DstDescs& dst_descs, + const StaticallyIndexedArray& dst_slice_origins, + const ElementwiseOperation& element_op) + : src_coords_(MakeCoordinates(src_descs, src_slice_origins)), + dst_coords_(MakeCoordinates(dst_descs, dst_slice_origins)), + element_op_(element_op) + { + static_assert(SliceLengths::At(Number{}) % ScalarPerVector == 0, + "wrong! cannot evenly divide"); + } + + template = false> + __device__ void SetSrcSliceOrigins(const SrcDescs& src_descs, + const Indices& src_slice_origin_idxs) + { + static_for<0, nSrc, 1>{}([&](auto i) { + src_coords_(i) = make_tensor_coordinate(src_descs[i], src_slice_origin_idxs[i]); + }); + } + + template = false> + __device__ void SetDstSliceOrigins(const DstDescs& dst_descs, + const Indices& dst_slice_origin_idxs) + { + static_for<0, nDst, 1>{}([&](auto i) { + dst_coords_(i) = make_tensor_coordinate(dst_descs[i], dst_slice_origin_idxs[i]); + }); + } + + // SrcDescs: Tuple + // SrcBuffers: Tuple + // DstDescs: Tuple + // DstBuffers: Tuple + template = false> + __device__ void Run(const SrcDescs& src_descs, + const SrcBuffers& src_bufs, + const DstDescs& dst_descs, + DstBuffers dst_bufs) + { + auto generate_vectors = [&](auto data_types) { + constexpr index_t num = data_types.Size(); + + return generate_tuple( + [&](auto i) { + using DataType = remove_cvref_t; + + return vector_type_maker_t{}; + }, + Number{}); + }; + + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + // loop over space-filling curve + static_for<0, num_access, 1>{}([&](auto iAccess) { + auto src_vectors = generate_vectors(SrcDatas{}); + auto dst_vectors = generate_vectors(DstDatas{}); + + // copy data from src_bufs into src_vectors + static_for<0, nSrc, 1>{}([&](auto i) { + using src_vector_t = typename remove_cvref_t::type; + + const bool is_src_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(src_descs[i], + src_coords_[i]); + + src_vectors(i).template AsType()(I0) = + src_bufs[i].template Get(src_coords_[i].GetOffset(), + is_src_valid); + }); + + // apply pointwise function + static_for<0, ScalarPerVector, 1>{}([&](auto i) { + // get reference to src data + const 
auto src_data_refs = generate_tie( + // return type should be lvalue + [&](auto iSrc) -> const auto& { + using SrcData = remove_cvref_t>; + + return src_vectors[iSrc].template AsType()[i]; + }, + Number{}); + + // get reference to dst data + auto dst_data_refs = generate_tie( + // return type should be lvalue + [&](auto iDst) -> auto& { + using DstData = remove_cvref_t>; + + return dst_vectors(iDst).template AsType()(i); + }, + Number{}); + + // apply pointwise function + // pointwise function signature: + // element_op_(dst_data_refs[I0], + // dst_data_refs[I1], + // ..., + // src_data_refs[I0], + // src_data_refs[I1], + // ...) + unpack2(element_op_, dst_data_refs, src_data_refs); + }); + + // copy data from buf_vectors into dst_bufs + static_for<0, nDst, 1>{}([&](auto i) { + using dst_vector_t = typename remove_cvref_t::type; + + const bool is_dst_valid = + coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_descs[i], + dst_coords_[i]); + + constexpr InMemoryDataOperationEnum DstInMemOp = + static_cast(DstInMemOps::At(i.value)); + + dst_bufs(i).template Update( + dst_coords_[i].GetOffset(), + is_dst_valid, + dst_vectors[i].template AsType()[I0]); + }); + + // move coordinate + if constexpr(iAccess.value != num_access - 1) + { + constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(iAccess); + + static_for<0, nSrc, 1>{}([&](auto i) { + move_tensor_coordinate(src_descs[i], + src_coords_(i), + make_tensor_coordinate_step(src_descs[i], forward_step)); + }); + + static_for<0, nDst, 1>{}([&](auto i) { + move_tensor_coordinate(dst_descs[i], + dst_coords_(i), + make_tensor_coordinate_step(dst_descs[i], forward_step)); + }); + } + }); + + // move coordinate back to slice origin (or not) + static_for<0, nSrc, 1>{}([&](auto i) { + if constexpr(SrcResetCoordinateAfterRunFlags::At(i)) + { + const auto src_reset_step = + make_tensor_coordinate_step(src_descs[i], GetCoordinateResetStep()); + + move_tensor_coordinate(src_descs[i], src_coords_(i), src_reset_step); + } + }); + + static_for<0, nDst, 1>{}([&](auto i) { + if constexpr(DstResetCoordinateAfterRunFlags::At(i)) + { + const auto dst_reset_step = + make_tensor_coordinate_step(dst_descs[i], GetCoordinateResetStep()); + + move_tensor_coordinate(dst_descs[i], dst_coords_(i), dst_reset_step); + } + }); + } + + __device__ static constexpr auto GetCoordinateResetStep() + { + constexpr auto num_access = SpaceFillingCurve::GetNumOfAccess(); + + if constexpr(num_access == 0) + { + return typename SpaceFillingCurve::Index{}; + } + else + { + constexpr auto reset_step = + SpaceFillingCurve::GetStepBetween(Number{}, Number<0>{}); + + return reset_step; + } + } + + // src_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void MoveSrcSliceWindow(const SrcDescs& src_descs, + Number iSrc, + const Index& src_slice_origin_step_idx) + { + // if src coord was not reset by RunRead(), then need to adjust the step here + const auto adjusted_step_idx = SrcResetCoordinateAfterRunFlags::At(iSrc) + ? src_slice_origin_step_idx + : src_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? 
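+ // Illustrative example (hypothetical values, for clarity only): with SliceLengths = {1, 8}
+ // and ScalarPerVector = 4 the space-filling curve makes two accesses, so an un-reset
+ // coordinate is left at k = 4 after Run() and GetCoordinateResetStep() is {0, -4}.
+ // A requested window move of {0, 8} is then adjusted to {0, 8} + {0, -4} = {0, 4},
+ // which lands on the same element a reset coordinate would reach by moving {0, 8}.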
+ const auto adjusted_step = make_tensor_coordinate_step(src_descs[iSrc], adjusted_step_idx); + + move_tensor_coordinate(src_descs[iSrc], src_coords_(iSrc), adjusted_step); + } + + // dst_slice_origin_step_idx need to be known at compile-time, for performance reason + template + __device__ void MoveDstSliceWindow(const DstDescs& dst_descs, + Number iDst, + const Index& dst_slice_origin_step_idx) + { + // if dst coord was not reset by Run(), then need to adjust the step here + const auto adjusted_step_idx = DstResetCoordinateAfterRunFlags::At(iDst) + ? dst_slice_origin_step_idx + : dst_slice_origin_step_idx + GetCoordinateResetStep(); + + // is it OK to construct a new step every time? + const auto adjusted_step = make_tensor_coordinate_step(dst_descs[iDst], adjusted_step_idx); + + move_tensor_coordinate(dst_descs[iDst], dst_coords_(iDst), adjusted_step); + } + + private: + SrcCoords src_coords_; + DstCoords dst_coords_; + const ElementwiseOperation element_op_; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp new file mode 100644 index 00000000..12ba2c53 --- /dev/null +++ b/include/ck/tensor_operation/gpu/thread/threadwise_welford.hpp @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/math_v2.hpp" + +namespace ck { + +// Assume +// 1) XDesc is known at compile-time +// 2) MeanVarDesc is known at compile-time +// 3) XBuffer is static buffer +// 4) MeanBuffer is static buffer +// 5) VarBuffer is static buffer +template +struct ThreadwiseWelford +{ + static constexpr auto x_thread_desc_m_k = XThreadDesc_M_K{}; + static constexpr auto mean_var_thread_desc_m = MeanVarThreadDesc_M{}; + + static constexpr auto thread_x_length_m = x_thread_desc_m_k.GetLength(Number<0>{}); + static constexpr auto thread_x_length_k = x_thread_desc_m_k.GetLength(Number<1>{}); + static constexpr auto thread_mean_var_length_m = mean_var_thread_desc_m.GetLength(Number<0>{}); + + static_assert(thread_x_length_m == thread_mean_var_length_m, + "lengths of source and mean/var buffer must match!"); + + __device__ constexpr ThreadwiseWelford() : cur_count_(0), max_count_(0) {} + + __device__ inline void Update(T& mean, T& var, T x) + { + using ck::math::isnan; + + if(isnan(x)) + { + mean = x; + var = x; + } + else + { + T delta = x - mean; + mean += delta / cur_count_; + T delta2 = x - mean; + var += delta * delta2; + } + } + + template + __device__ void + Run(const XBufferType& x_buf_m_k, MeanBufferType& mean_buf_m, VarBufferType& var_buf_m) + { + // FIXME - Better naming for var_buf_m + + static_for<0, thread_x_length_k, 1>{}([&](auto iK) { + if(cur_count_ < max_count_) + { + ++cur_count_; + + static_for<0, thread_x_length_m, 1>{}([&](auto iM) { + constexpr index_t out_offset = + mean_var_thread_desc_m.CalculateOffset(make_tuple(iM)); + + constexpr auto in_offset = + x_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + Update(mean_buf_m(Number{}), + var_buf_m(Number{}), + x_buf_m_k[Number{}]); + }); + } + }); + }; + + int cur_count_; + int max_count_; +}; + +template +struct ThreadwiseWelfordMerge +{ + static constexpr auto src_thread_desc_m_k = SrcMeanVarCountThreadDesc_M_K{}; + static constexpr auto dst_thread_desc_m = DstMeanVarThreadDesc_M{}; + + static constexpr auto src_length_m = src_thread_desc_m_k.GetLength(Number<0>{}); + static constexpr auto src_length_k = 
src_thread_desc_m_k.GetLength(Number<1>{}); + static constexpr auto dst_length_m = dst_thread_desc_m.GetLength(Number<0>{}); + + static_assert(src_length_m == dst_length_m, "lengths of source and dst buffer must match!"); + + __device__ static void + Merge(T& mean_a, T& var_a, int32_t& count_a, T mean_b, T var_b, int32_t count_b) + { + int count = count_a + count_b; + T count_b_over_count = count == 0 ? type_convert(0) : type_convert(count_b) / count; + T delta = mean_b - mean_a; + mean_a += delta * count_b_over_count; + var_a += var_b + delta * delta * count_a * count_b_over_count; + count_a = count; + } + + template + __device__ static void Run(const SrcMeanBufferType& src_mean_buf, + const SrcVarBufferType& src_var_buf, + const SrcCountBufferType& src_count_buf, + DstMeanBufferType& dst_mean_buf, + DstVarBufferType& dst_var_buf, + DstCountBufferType& dst_count_buf) + { + static_for<0, src_length_m, 1>{}([&](auto iM) { + static_for<0, src_length_k, 1>{}([&](auto iK) { + constexpr auto src_offset = src_thread_desc_m_k.CalculateOffset(make_tuple(iM, iK)); + + Merge(dst_mean_buf(iM), + dst_var_buf(iM), + dst_count_buf(iM), + src_mean_buf[Number{}], + src_var_buf[Number{}], + src_count_buf[Number{}]); + }); + + if constexpr(GetActualVariance) + { + dst_var_buf(iM) = dst_var_buf[iM] / dst_count_buf[iM]; + }; + }); + }; +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp new file mode 100644 index 00000000..4d53f0d8 --- /dev/null +++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp @@ -0,0 +1,851 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/utility/math.hpp" +#include "ck/utility/amd_xdlops.hpp" + +namespace ck { + +enum struct MfmaInstr +{ + mfma_f32_32x32x1xf32 = 0, + mfma_f32_16x16x1xf32, + mfma_f32_4x4x1xf32, + mfma_f32_32x32x2xf32, + mfma_f32_16x16x4xf32, + mfma_f32_32x32x4f16, + mfma_f32_16x16x4f16, + mfma_f32_4x4x4f16, + mfma_f32_32x32x8f16, + mfma_f32_16x16x16f16, + mfma_f32_32x32x8bf16_1k, + mfma_f32_16x16x16bf16_1k, + mfma_f32_32x32x4bf16, + mfma_f32_16x16x8bf16, + mfma_i32_32x32x8i8, + mfma_i32_16x16x16i8, + mfma_f64_16x16x4f64 +}; + +template +struct mfma_type; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 2; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = false; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x1f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 1; + static 
constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x2f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x4f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 4; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = false; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x1f32::Run(a, b, reg_c); + } +}; + +// treat 4x4x1 as a single-blk 4x64 mfma +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 64; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 1; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 4; + static constexpr index_t n_per_blk = 64; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = false; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_4x4x1f32::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 2; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = false; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x4f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = 
true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x8f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x16f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 4; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = false; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x4f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 64; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 1; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 4; + static constexpr index_t n_per_blk = 64; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = false; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_4x4x4f16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x8bf16_1k::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, 
FloatC& reg_c) const + { + intrin_mfma_f32_16x16x16bf16_1k::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 2; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_32x32x4bf16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 2; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_f32_16x16x8bf16::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 16; + static constexpr index_t num_threads_per_blk = 32; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 2; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 32; + static constexpr index_t n_per_blk = 32; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_i32_32x32x8i8::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 4; + static constexpr index_t num_groups_per_blk = 1; + static constexpr index_t num_regs_per_blk = 4; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 4; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const + { + intrin_mfma_i32_16x16x16i8::Run(a, b, reg_c); + } +}; + +template <> +struct mfma_type +{ + static constexpr index_t group_size = 1; + static constexpr index_t num_groups_per_blk = 4; + static constexpr index_t num_regs_per_blk = 4; // group_size * num_groups_per_blk; + static constexpr index_t num_threads_per_blk = 16; + static constexpr index_t wave_size = 64; + static constexpr index_t num_input_blks = 4; // wave_size / num_threads_per_blk; + static constexpr index_t num_output_blks = 1; + static constexpr index_t m_per_blk = 16; + static constexpr index_t n_per_blk = 16; + static constexpr index_t k_per_blk = 1; + static constexpr bool is_k_reduction = true; + + template + __device__ void run(const FloatA& a, const FloatB& b, 
FloatC& reg_c) const + { + intrin_mfma_f64_16x16x4f64::Run(a, b, reg_c); + } +}; + +template +struct MfmaSelector +{ + template + static constexpr auto GetMfma(); + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f64_16x16x4f64; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x1xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x1xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_16x16x1xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_4x4x1xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_4x4x1xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x2xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_16x16x4xf32; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x4f16; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x4f16; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_32x32x8f16; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_16x16x16f16; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_16x16x4f16; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_4x4x4f16; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_f32_4x4x4f16; + } + + template <> + static constexpr auto GetMfma() + { +#if defined(CK_USE_AMD_MFMA_BF16_1K_OP) + return MfmaInstr::mfma_f32_32x32x8bf16_1k; +#else + return MfmaInstr::mfma_f32_32x32x4bf16; +#endif + } + + template <> + static constexpr auto GetMfma() + { +#if defined(CK_USE_AMD_MFMA_BF16_1K_OP) + return MfmaInstr::mfma_f32_16x16x16bf16_1k; +#else + return MfmaInstr::mfma_f32_16x16x8bf16; +#endif + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_i32_32x32x8i8; + } + + template <> + static constexpr auto GetMfma() + { + return MfmaInstr::mfma_i32_16x16x16i8; + } + + static constexpr auto selected_mfma = mfma_type()>{}; + + __host__ __device__ constexpr MfmaSelector() + { + static_assert(selected_mfma.group_size * selected_mfma.num_groups_per_blk == + selected_mfma.num_regs_per_blk, + "wrong! num_regs_per_blk"); + + static_assert(selected_mfma.num_threads_per_blk == selected_mfma.n_per_blk, + "n_per_blk != num_threads_per_blk"); + + static_assert(selected_mfma.num_regs_per_blk * selected_mfma.num_input_blks == + selected_mfma.m_per_blk, + "m_per_blk != num_input_blks * num_regs_per_blk"); + + static_assert(selected_mfma.num_output_blks == selected_mfma.num_input_blks || + selected_mfma.num_output_blks == 1, + "incorrect num_output_blks"); + + static_assert(selected_mfma.num_regs_per_blk * selected_mfma.wave_size == + selected_mfma.m_per_blk * selected_mfma.n_per_blk, + "num_regs_per_blk incorrect"); + + static_assert(selected_mfma.is_k_reduction || + (selected_mfma.num_input_blks == selected_mfma.num_output_blks), + "is_k_reduction wrong!"); + } + + static constexpr bool IsABroadcast() + { + static_assert(NPerXdlops >= MPerXdlops, "only support ABroadcast"); + return true; + } + + static constexpr index_t GetKPerXdlops() + { + return (selected_mfma.is_k_reduction ? 
selected_mfma.num_input_blks : 1) * + selected_mfma.k_per_blk; + } + + static constexpr index_t GetK1PerXdlops() { return selected_mfma.k_per_blk; } +}; + +template +struct XdlopsGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + static constexpr auto I5 = Number<5>{}; + + using CIndex = MultiIndex<2>; + using CIndex4D = MultiIndex<4>; + + __device__ static constexpr index_t GetNumBlks() { return mfma_instr.num_output_blks; } + + __device__ static constexpr index_t GetNumXdlops() + { + return MPerXdlops * NPerXdlops / + (mfma_instr.m_per_blk * mfma_instr.n_per_blk * mfma_instr.num_output_blks); + } + + __host__ __device__ constexpr XdlopsGemm() + { + static_assert(NPerXdlops == 4 || NPerXdlops == 8 || NPerXdlops == 16 || NPerXdlops == 32 || + NPerXdlops == 64, + "Only support GemmNPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); + + static_assert(MPerXdlops == 4 || MPerXdlops == 8 || MPerXdlops == 16 || MPerXdlops == 32 || + MPerXdlops == 64, + "Only support GemmMPerXdlops == 4, 8, 16, 32 or 64 for xdlops"); + + static_assert(KPack % mfma_instr.k_per_blk == 0, "KPack cannot be divided by k_per_blk"); + } + + // XDL output supporting C = A * B + // M2_N2 -> M2_M3_M4_N2 + template + __host__ __device__ static constexpr auto + MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2) + { + const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0); + const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1); + const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2); + const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3); + + return transform_tensor_descriptor( + c_desc_m0_n0_m1_n1_m2_n2, + make_tuple(make_pass_through_transform(M0), + make_pass_through_transform(N0), + make_pass_through_transform(M1), + make_pass_through_transform(N1), + make_unmerge_transform(make_tuple(Number{}, + Number{}, + Number{})), + make_pass_through_transform(Number{})), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4, 5, 6>{}, + Sequence<7>{})); + } + + // transposed XDL output supporting C' = B' * A' + // M2_N2 -> M2_N2_N3_N4 + template + __host__ __device__ static constexpr auto + MakeCDescriptor_M0_N0_M1_N1_M2_N2_N3_N4(const CDesc_M0_N0_M1_N1_M2_N2& c_desc_m0_n0_m1_n1_m2_n2) + { + const auto M0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I0); + const auto N0 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I1); + const auto M1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I2); + const auto N1 = c_desc_m0_n0_m1_n1_m2_n2.GetLength(I3); + + return transform_tensor_descriptor( + c_desc_m0_n0_m1_n1_m2_n2, + make_tuple(make_pass_through_transform(M0), + make_pass_through_transform(N0), + make_pass_through_transform(M1), + make_pass_through_transform(N1), + make_pass_through_transform(Number{}), + make_unmerge_transform(make_tuple(Number{}, + Number{}, + Number{}))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6, 7>{})); + } + + template + __host__ __device__ static constexpr auto MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2( + const CDesc_G_M0_N0_M1_N1_M2_N2& c_desc_g_m0_n0_m1_n1_m2_n2) + { + const auto G = 
c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I0); + const auto M0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I1); + const auto N0 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I2); + const auto M1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I3); + const auto N1 = c_desc_g_m0_n0_m1_n1_m2_n2.GetLength(I4); + + return transform_tensor_descriptor( + c_desc_g_m0_n0_m1_n1_m2_n2, + make_tuple(make_pass_through_transform(G), + make_pass_through_transform(M0), + make_pass_through_transform(N0), + make_pass_through_transform(M1), + make_pass_through_transform(N1), + make_unmerge_transform(make_tuple(mfma_instr.num_groups_per_blk, + mfma_instr.num_input_blks, + mfma_instr.group_size)), + make_pass_through_transform(mfma_instr.num_threads_per_blk)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}, + Sequence<6>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6, 7>{}, + Sequence<8>{})); + } + + __device__ static constexpr index_t GetRegSizePerXdlops() + { + return MPerXdlops * NPerXdlops / mfma_instr.wave_size; + } + + __device__ static constexpr index_t GetWaveSize() { return mfma_instr.wave_size; } + + template + __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "base base_type must be double, float, half, bfloat16, and int8_t!"); + + static_for<0, KPack / mfma_instr.k_per_blk, 1>{}([&](auto k) { + if constexpr(!TransposeC) + { + mfma_instr.template run( + p_a_wave[k], p_b_wave[k], p_c_thread); + } + else + { + mfma_instr.template run( + p_b_wave[k], p_a_wave[k], p_c_thread); + } + }); + } + + __device__ static auto GetLaneId() { return get_thread_local_1d_id() % mfma_instr.wave_size; } + + __device__ static auto GetBlkIdx() + { + const auto laneId = GetLaneId(); + + constexpr auto threadidx_to_blk_idx_adaptor = make_single_stage_tensor_adaptor( + make_tuple(make_merge_transform( + make_tuple(1, mfma_instr.num_input_blks, mfma_instr.num_threads_per_blk))), + make_tuple(Sequence<0, 1, 2>{}), + make_tuple(Sequence<0>{})); + + const auto blk_idx = + threadidx_to_blk_idx_adaptor.CalculateBottomIndex(make_multi_index(laneId)); + + const auto blk_id = blk_idx[I1]; + const auto blk_td = blk_idx[I2]; + + return make_tuple(blk_id, blk_td); + } + + __host__ __device__ static auto CalculateAThreadOriginDataIndex() + { + const auto laneId = GetLaneId(); + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + if constexpr(mfma_instr.is_k_reduction) + { + return make_tuple(blk_id, blk_td); + } + else + { + return make_tuple(0, laneId); + } + } + + __host__ __device__ static auto CalculateBThreadOriginDataIndex() + { + const auto laneId = GetLaneId(); + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + if constexpr(mfma_instr.is_k_reduction) + { + return make_tuple(blk_id, blk_td); + } + else + { + return make_tuple(0, laneId); + } + } + + __device__ static CIndex GetBeginOfThreadBlk(index_t xdlops_i, index_t blk_i) + { + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + index_t n_offset = blk_i * mfma_instr.n_per_blk + blk_td; + index_t m_offset = xdlops_i * mfma_instr.m_per_blk + blk_id * mfma_instr.group_size; + + return TransposeC ? 
CIndex{n_offset, m_offset} : CIndex{m_offset, n_offset}; + } + + __device__ static CIndex4D GetBeginOfThreadBlk4D(index_t /* xdlops_i */, index_t /* blk_i */) + { + const auto blk_idx = GetBlkIdx(); + + const auto blk_id = blk_idx[I0]; + const auto blk_td = blk_idx[I1]; + + return TransposeC ? CIndex4D{blk_td, I0, blk_id, I0} : CIndex4D{I0, blk_id, I0, blk_td}; + } + + static constexpr auto mfma = MfmaSelector{}; + + static constexpr auto mfma_instr = mfma.selected_mfma; + + static constexpr auto KPerXdlops = mfma.GetKPerXdlops(); + static constexpr auto K1PerXdlops = mfma.GetK1PerXdlops(); + static constexpr auto K0PerXdlops = KPerXdlops / K1PerXdlops; + + __host__ __device__ static constexpr auto GetCM0M1M2NThreadBlkLengths() + { + return make_tuple( + Number{}, I1, Number{}, I1); + } +}; + +} // namespace ck diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp new file mode 100644 index 00000000..5fc11d91 --- /dev/null +++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp" + +namespace ck { +namespace tensor_operation { + +// assume C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] +template +static auto MakeGridDescriptorPair(const std::vector& gs_ms_ns_lengths_vec, + const std::vector& gs_ms_ns_strides_vec) +{ + if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN && + gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN)) + { + throw std::runtime_error("wrong! dimension must match input lengths"); + } + + const auto to_tuple = [&](auto& vec, auto start, auto end) { + return generate_tuple([&](auto i) { return vec[start + i]; }, Number{}); + }; + + const auto gs_ms_ns_lengths = + to_tuple(gs_ms_ns_lengths_vec, Number<0>{}, Number{}); + const auto gs_ms_ns_strides = + to_tuple(gs_ms_ns_strides_vec, Number<0>{}, Number{}); + + // dimension Ids for G0, G1, ... + constexpr auto gDimIds = typename arithmetic_sequence_gen<0, NumDimG, 1>::type{}; + + // dimension Ids for M0, M1, ... + constexpr auto mDimIds = + typename arithmetic_sequence_gen::type{}; + + // dimension Ids for N0, N1, ... + constexpr auto nDimIds = + typename arithmetic_sequence_gen::type{}; + + // lengths for G0, G1, ... + const auto gLengths = get_container_subset(gs_ms_ns_lengths, gDimIds); + + // lengths for M0, M1, ... + const auto mLengths = get_container_subset(gs_ms_ns_lengths, mDimIds); + + // lengths for N0, N1, ... 
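+ // For example (hypothetical shapes): with NumDimG = 1, NumDimM = 2, NumDimN = 2 the
+ // dimension ids partition as gDimIds = {0}, mDimIds = {1, 2}, nDimIds = {3, 4}, so
+ // gLengths / mLengths / nLengths pick out {G0}, {M0, M1} and {N0, N1}; the Packed
+ // branch below then collapses them to G = G0, M = M0 * M1, N = N0 * N1 via
+ // container_reduce.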
+ const auto nLengths = get_container_subset(gs_ms_ns_lengths, nDimIds); + + if constexpr(TensorSpec == device::TensorSpecialization::Packed) + { + auto G = container_reduce(gLengths, math::multiplies{}, Number<1>{}); + auto M = container_reduce(mLengths, math::multiplies{}, Number<1>{}); + auto N = container_reduce(nLengths, math::multiplies{}, Number<1>{}); + const auto grid_desc_g_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(G, M, N), + make_tuple(gs_ms_ns_strides[Number{}], + gs_ms_ns_strides[Number{}], + gs_ms_ns_strides[Number{}])); + + const auto grid_desc_mraw_nraw = make_naive_tensor_descriptor( + make_tuple(M, N), + make_tuple(gs_ms_ns_strides[Number{}], + gs_ms_ns_strides[Number{}])); + + return std::make_pair(grid_desc_g_mraw_nraw, grid_desc_mraw_nraw); + } + else + { + // naive tensor C[G0, G1, ..., M0, M1, M2, ..., N0, N1, N2...] + const auto grid_desc_gs_ms_ns = + make_naive_tensor_descriptor(gs_ms_ns_lengths, gs_ms_ns_strides); + + // transformed tensor C[G = G0 * G1 * ..., MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * + // N2 * ...] + // Note: This does not require padding as it only provides G offset calculation. Technically + // descriptor for only G is needed. Here we opt for backward compatibility purpose to return + // G_M_N + const auto grid_desc_g_mraw_nraw = + transform_tensor_descriptor(grid_desc_gs_ms_ns, + make_tuple(make_merge_transform(gLengths), + make_merge_transform(mLengths), + make_merge_transform(nLengths)), + make_tuple(gDimIds, mDimIds, nDimIds), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto c_ms_ns_lengths = to_tuple( + gs_ms_ns_lengths_vec, Number{}, Number{}); + const auto c_ms_ns_strides = to_tuple( + gs_ms_ns_strides_vec, Number{}, Number{}); + + // transformed tensor C[MRaw = M0 * M1 * M2 * ... , NRaw = N0 * N1 * + // N2 * ...] 
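+ // Sketch of this non-packed path (illustrative, using the same hypothetical shapes):
+ // for M-dims {M0, M1} and N-dims {N0, N1} with arbitrary strides, grid_desc_ms_ns
+ // below is a naive 4-d descriptor, and the merge transforms that follow fold it into
+ // a 2-d (MRaw, NRaw) = (M0 * M1, N0 * N1) view, so a GEMM index (m, n) is decomposed
+ // back into (m0, m1, n0, n1) and mapped through the per-dimension strides.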
+ const auto grid_desc_ms_ns = make_naive_tensor_descriptor(c_ms_ns_lengths, c_ms_ns_strides); + + const auto grid_desc_mraw_nraw = transform_tensor_descriptor( + grid_desc_ms_ns, + make_tuple(make_merge_transform(mLengths), make_merge_transform(nLengths)), + make_tuple(mDimIds - Number{}, nDimIds - Number{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return std::make_pair(grid_desc_g_mraw_nraw, grid_desc_mraw_nraw); + } +} + +template + typename PerBlock_M_N_K_O, // Sequence<> + device::GemmSpecialization GemmSpec, + device::TensorSpecialization ASpec, + device::TensorSpecialization B0Spec, + device::TensorSpecialization B1Spec, + device::TensorSpecialization CSpec> +struct TransformBatchedContractionContractionToBatchedGemmGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + + static constexpr index_t NumDimG = NumDims_G_M_N_K_O::At(I0); + static constexpr index_t NumDimM = NumDims_G_M_N_K_O::At(I1); + static constexpr index_t NumDimN = NumDims_G_M_N_K_O::At(I2); + static constexpr index_t NumDimK = NumDims_G_M_N_K_O::At(I3); + static constexpr index_t NumDimO = NumDims_G_M_N_K_O::At(I4); + + static constexpr index_t MPerBlock = PerBlock_M_N_K_O::At(I0); + static constexpr index_t NPerBlock = PerBlock_M_N_K_O::At(I1); + static constexpr index_t KPerBlock = PerBlock_M_N_K_O::At(I2); + static constexpr index_t OPerBlock = PerBlock_M_N_K_O::At(I3); + + static constexpr auto matrix_padder = + device::GemmGemmPadder{ + MPerBlock, NPerBlock, KPerBlock, OPerBlock}; + + // + // A + // + static auto MakeAGridDescriptorPair(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + return MakeGridDescriptorPair(a_gs_ms_ks_lengths_vec, + a_gs_ms_ks_strides_vec); + } + + // TODO: rename to G_MRaw_KRaw + static auto MakeAGridDescriptor_G_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + return MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).first; + } + static auto MakeAGridDescriptor_M_K(const std::vector& a_gs_ms_ks_lengths_vec, + const std::vector& a_gs_ms_ks_strides_vec) + { + return matrix_padder.PadADescriptor_M_K( + MakeAGridDescriptorPair(a_gs_ms_ks_lengths_vec, a_gs_ms_ks_strides_vec).second); + } + + template + __host__ __device__ static constexpr auto + MakeAGridDescriptor_AK0_M_AK1(const AGridDesc_M_K& a_grid_desc_m_k, const Number& AK1) + { + const auto M = a_grid_desc_m_k.GetLength(I0); + const auto K = a_grid_desc_m_k.GetLength(I1); + + const auto AK0 = K / AK1; + + return transform_tensor_descriptor(a_grid_desc_m_k, + make_tuple(make_unmerge_transform(make_tuple(AK0, AK1)), + make_pass_through_transform(M)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // + // B (alias of B0) + // + static auto MakeB0GridDescriptorPair(const std::vector& b0_gs_ns_ks_lengths_vec, + const std::vector& b0_gs_ns_ks_strides_vec) + { + return MakeGridDescriptorPair(b0_gs_ns_ks_lengths_vec, + b0_gs_ns_ks_strides_vec); + } + + // TODO: rename to G_MRaw_NRaw + static auto MakeB0GridDescriptor_G_N_K(const std::vector& b0_gs_ns_ks_lengths_vec, + const std::vector& b0_gs_ns_ks_strides_vec) + { + return MakeB0GridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec).first; + } + static auto MakeB0GridDescriptor_N_K(const std::vector& b0_gs_ns_ks_lengths_vec, + const 
std::vector& b0_gs_ns_ks_strides_vec) + { + // alias of matrix_padder.PadB0Descriptor_N_K + return matrix_padder.PadBDescriptor_N_K( + MakeB0GridDescriptorPair(b0_gs_ns_ks_lengths_vec, b0_gs_ns_ks_strides_vec).second); + } + + template + __host__ __device__ static constexpr auto + MakeB0GridDescriptor_BK0_N_BK1(const BGridDesc_N_K& b_grid_desc_n_k, const Number& BK1) + { + const auto N = b_grid_desc_n_k.GetLength(I0); + const auto K = b_grid_desc_n_k.GetLength(I1); + + const auto BK0 = K / BK1; + + return transform_tensor_descriptor(b_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // + // B1 + // + static auto MakeB1GridDescriptorPair(const std::vector& b1_gs_os_ns_lengths_vec, + const std::vector& b1_gs_os_ns_strides_vec) + { + return MakeGridDescriptorPair(b1_gs_os_ns_lengths_vec, + b1_gs_os_ns_strides_vec); + } + + // TODO: rename to G_NRaw_KRaw + static auto MakeB1GridDescriptor_G_N_K(const std::vector& b1_gs_os_ns_lengths_vec, + const std::vector& b1_gs_os_ns_strides_vec) + { + return MakeB1GridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec).first; + } + static auto MakeB1GridDescriptor_N_K(const std::vector& b1_gs_os_ns_lengths_vec, + const std::vector& b1_gs_os_ns_strides_vec) + { + // alias of matrix_padder.PadB1Descriptor_O_N + return matrix_padder.PadB1Descriptor_N_K( + MakeB1GridDescriptorPair(b1_gs_os_ns_lengths_vec, b1_gs_os_ns_strides_vec).second); + } + + template + __host__ __device__ static constexpr auto + MakeB1GridDescriptor_BK0_N_BK1(const B1GridDesc_N_K& b1_grid_desc_n_k, const Number& B1K1) + { + const auto N = b1_grid_desc_n_k.GetLength(I0); + const auto K = b1_grid_desc_n_k.GetLength(I1); + + const auto B1K0 = K / B1K1; + + return transform_tensor_descriptor( + b1_grid_desc_n_k, + make_tuple(make_unmerge_transform(make_tuple(B1K0, B1K1)), + make_pass_through_transform(N)), + make_tuple(Sequence<1>{}, Sequence<0>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + } + + // + // C + // + static auto MakeCGridDescriptorPair(const std::vector& c_gs_ms_os_lengths_vec, + const std::vector& c_gs_ms_os_strides_vec) + { + return MakeGridDescriptorPair(c_gs_ms_os_lengths_vec, + c_gs_ms_os_strides_vec); + } + + // TODO: rename to G_MRaw_NRaw + static auto MakeCGridDescriptor_G_M_N(const std::vector& c_gs_ms_os_lengths_vec, + const std::vector& c_gs_ms_os_strides_vec) + { + return MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).first; + } + static auto MakeCGridDescriptor_M_N(const std::vector& c_gs_ms_os_lengths_vec, + const std::vector& c_gs_ms_os_strides_vec) + { + return matrix_padder.PadCDescriptor_M_N( + MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).second); + } +}; + +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp new file mode 100644 index 00000000..13d0a28c --- /dev/null +++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_data_to_gemm_v1.hpp @@ -0,0 +1,583 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
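+//
+// Overview (informal sketch; the descriptor builders below are the authoritative
+// construction): backward-data convolution is re-expressed as a family of implicit
+// GEMMs indexed by (i_ytilde, i_xtilde), where
+// YTilde = ConvStrideH / gcd(ConvStrideH, ConvDilationH) and
+// XTilde = ConvStrideW / gcd(ConvStrideW, ConvDilationW). As an illustrative example,
+// a 3x3 filter with stride 2 and dilation 1 gives YTilde = XTilde = 2, i.e. four GEMMs,
+// each consuming YDotSlice = ceil((Y - i_ytilde) / YTilde) filter rows and
+// XDotSlice = ceil((X - i_xtilde) / XTilde) filter columns.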
+ +#pragma once + +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" + +namespace ck { +namespace tensor_operation { + +template < + index_t NDimSpatial, + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization ConvBwdDataSpecialization, + index_t AK1, + index_t BK1, + index_t GemmMPerBlock, + index_t GemmNPerBlock, + bool DoPadGemmM, + bool DoPadGemmN> +struct TransformConvBwdDataToGemm_v1 +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + template , + bool>::type = false> + static auto MakeADescriptor_AK0_M_AK1( + const std::array& out_g_n_k_wos_lengths, + const std::array& /* out_g_n_k_wos_strides */, + const std::array& wei_g_k_c_xs_lengths, + const std::array& /* wei_g_k_c_xs_strides */, + const std::array& in_g_n_c_wis_lengths, + const std::array& /* in_g_n_c_wis_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& /* input_right_pads */, + const std::array& tildes) + { + index_t i_ytilde = tildes[0]; + index_t i_xtilde = tildes[1]; + + const index_t N = in_g_n_c_wis_lengths[1]; + const index_t K = wei_g_k_c_xs_lengths[1]; + + const index_t Hi = in_g_n_c_wis_lengths[3]; + const index_t Wi = in_g_n_c_wis_lengths[4]; + + const index_t Ho = out_g_n_k_wos_lengths[3]; + const index_t Wo = out_g_n_k_wos_lengths[4]; + + const index_t Y = wei_g_k_c_xs_lengths[3]; + const index_t X = wei_g_k_c_xs_lengths[4]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t AK0 = K / AK1; + + // assume packed + const auto out_n_ho_wo_k_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Ho, Wo, K)); + + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + // A: output tensor + const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( + make_naive_tensor_descriptor_packed(make_tuple(N * Ho * Wo, K)), + make_tuple(make_pass_through_transform(N * Ho * Wo), + make_unmerge_transform(make_tuple(AK0, AK1))), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<1>{}, Sequence<0, 2>{})); + + const auto out_gemmak0_gemmm_gemmak1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + out_gemmak0_gemmmraw_gemmak1_grid_desc, + make_tuple(AK0, GemmMPerBlock, AK1), + Sequence{}); + + return out_gemmak0_gemmm_gemmak1_grid_desc; + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto 
WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // A: output tensor + const auto out_n_hop_wop_k_grid_desc = transform_tensor_descriptor( + out_n_ho_wo_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Ho, I0, I0), + make_pad_transform(Wo, I0, I0), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto out_n_ydot_htilde_xdot_wtilde_k_grid_desc = transform_tensor_descriptor( + out_n_hop_wop_k_grid_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(YDot, HTilde), + make_tuple(-ConvDilationH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, WTilde), + make_tuple(-ConvDilationW / GcdStrideDilationW, I1)), + make_pass_through_transform(K)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto out_n_ydotslice_htildeslice_xdotslice_wtildeslice_ak0_ak1_grid_desc = + transform_tensor_descriptor( + out_n_ydot_htilde_xdot_wtilde_k_grid_desc, + make_tuple(make_pass_through_transform(N), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_unmerge_transform(make_tuple(AK0, AK1))), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5, 6>{})); + + const auto out_gemmak0_gemmmraw_gemmak1_grid_desc = transform_tensor_descriptor( + out_n_ydotslice_htildeslice_xdotslice_wtildeslice_ak0_ak1_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, AK0)), + make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(AK1)), + make_tuple(Sequence<1, 3, 5>{}, Sequence<0, 2, 4>{}, Sequence<6>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto out_gemmak0_gemmm_gemmak1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + out_gemmak0_gemmmraw_gemmak1_grid_desc, + make_tuple(AK0, GemmMPerBlock, AK1), + Sequence{}); + + return out_gemmak0_gemmm_gemmak1_grid_desc; + } + } + + template , + bool>::type = false> + static auto MakeBDescriptor_BK0_N_BK1( + const std::array& out_g_n_k_wos_lengths, + const std::array& 
/* out_g_n_k_wos_strides */, + const std::array& wei_g_k_c_xs_lengths, + const std::array& /* wei_g_k_c_xs_strides */, + const std::array& in_g_n_c_wis_lengths, + const std::array& /* in_g_n_c_wis_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& /* input_left_pads */, + const std::array& /* input_right_pads */, + const std::array& tildes) + { + index_t i_ytilde = tildes[0]; + index_t i_xtilde = tildes[1]; + + const index_t N = in_g_n_c_wis_lengths[1]; + const index_t K = wei_g_k_c_xs_lengths[1]; + const index_t C = wei_g_k_c_xs_lengths[2]; + + const index_t Ho = out_g_n_k_wos_lengths[3]; + const index_t Wo = out_g_n_k_wos_lengths[4]; + + const index_t Y = wei_g_k_c_xs_lengths[3]; + const index_t X = wei_g_k_c_xs_lengths[4]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t BK0 = K / BK1; + + // assume packed + const auto wei_k_y_x_c_grid_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, Y, X, C)); + + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + // B: weight tensor + const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = + transform_tensor_descriptor(make_naive_tensor_descriptor_packed(make_tuple(K, C)), + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0, 2>{}, Sequence<1>{})); + make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, C), make_tuple(I0, I1)); + + const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + wei_gemmbk0_gemmnraw_gemmbk1_grid_desc, + make_tuple(BK0, GemmNPerBlock, BK1), + Sequence{}); + + return wei_gemmbk0_gemmn_gemmbk1_grid_desc; + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto YDot = math::integer_divide_ceil(Y, YTilde); + const auto XDot = math::integer_divide_ceil(X, XTilde); + + // GemmK is different for each GEMM + const auto YDotSlice = math::integer_divide_ceil(Y - i_ytilde, YTilde); + const auto XDotSlice = math::integer_divide_ceil(X - i_xtilde, XTilde); + + // B weight tensor + const auto wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc = transform_tensor_descriptor( + wei_k_y_x_c_grid_desc, + make_tuple(make_pass_through_transform(K), + make_embed_transform(make_tuple(YDot, YTilde), + make_tuple(ConvStrideH / GcdStrideDilationH, I1)), + make_embed_transform(make_tuple(XDot, XTilde), + make_tuple(ConvStrideW / GcdStrideDilationW, I1)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto wei_bk0_bk1_ydotslice_xdotslice_c_grid_desc = + transform_tensor_descriptor(wei_k_ydot_ytilde_xdot_xtilde_c_grid_desc, + make_tuple(make_unmerge_transform(make_tuple(BK0, BK1)), + make_slice_transform(YDot, I0, YDotSlice), + make_slice_transform(XDot, I0, XDotSlice), + make_freeze_transform(i_ytilde), + make_freeze_transform(i_xtilde), + make_pass_through_transform(C)), + 
make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<3>{}, + Sequence<2>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0, 1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<>{}, + Sequence<>{}, + Sequence<4>{})); + + const auto wei_gemmbk0_gemmnraw_gemmbk1_grid_desc = transform_tensor_descriptor( + wei_bk0_bk1_ydotslice_xdotslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(YDotSlice, XDotSlice, BK0)), + make_pass_through_transform(C), + make_pass_through_transform(BK1)), + make_tuple(Sequence<2, 3, 0>{}, Sequence<4>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto wei_gemmbk0_gemmn_gemmbk1_grid_desc = + ck::tensor_operation::device::PadTensorDescriptor( + wei_gemmbk0_gemmnraw_gemmbk1_grid_desc, + make_tuple( + wei_gemmbk0_gemmnraw_gemmbk1_grid_desc.GetLength(I0), GemmNPerBlock, BK1), + Sequence{}); + + return wei_gemmbk0_gemmn_gemmbk1_grid_desc; + } + } + + template || + is_same_v || + is_same_v), + bool>::type = false> + static auto + MakeCDescriptor_M_N(const std::array& out_g_n_k_wos_lengths, + const std::array& /* out_g_n_k_wos_strides */, + const std::array& wei_g_k_c_xs_lengths, + const std::array& /* wei_g_k_c_xs_strides */, + const std::array& in_g_n_c_wis_lengths, + const std::array& in_g_n_c_wis_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const std::array& tildes) + { + index_t i_ytilde = tildes[0]; + index_t i_xtilde = tildes[1]; + + const index_t N = in_g_n_c_wis_lengths[1]; + const index_t C = wei_g_k_c_xs_lengths[2]; + + const index_t Hi = in_g_n_c_wis_lengths[3]; + const index_t Wi = in_g_n_c_wis_lengths[4]; + + const index_t Ho = out_g_n_k_wos_lengths[3]; + const index_t Wo = out_g_n_k_wos_lengths[4]; + + const index_t Y = wei_g_k_c_xs_lengths[3]; + const index_t X = wei_g_k_c_xs_lengths[4]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + // assume strided + const auto in_n_hi_wi_c_grid_desc = + make_naive_tensor_descriptor(make_tuple(N, Hi, Wi, C), + make_tuple(in_g_n_c_wis_strides[1], + in_g_n_c_wis_strides[3], + in_g_n_c_wis_strides[4], + in_g_n_c_wis_strides[2])); + + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + // C: input tensor + const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(I1, Ho), make_tuple(I1, ConvStrideH)), + make_embed_transform(make_tuple(I1, Wo), make_tuple(I1, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( + in_n_y_ho_x_wo_c_grid_desc, + make_tuple(make_freeze_transform(I0), + make_freeze_transform(I0), + make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<1>{}, Sequence<3>{}, 
Sequence<0, 2, 4>{}, Sequence<5>{}), + make_tuple(Sequence<>{}, Sequence<>{}, Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmm_gemmn_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( + in_gemmmraw_gemmnraw_grid_desc, + make_tuple(GemmMPerBlock, GemmNPerBlock), + Sequence{}); + + return in_gemmm_gemmn_grid_desc; + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // C: input tensor + const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor( + in_n_hi_wi_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc = transform_tensor_descriptor( + in_n_hip_wip_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(YTilde, HTilde), + make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(XTilde, WTilde), + make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_n_htildeslice_wtildeslice_c_grid_desc = transform_tensor_descriptor( + in_n_ytilde_htilde_xtilde_wtilde_c_grid_desc, + make_tuple(make_pass_through_transform(N), + make_freeze_transform(i_ytilde), + make_slice_transform(HTilde, IHTildeSliceBegin, HTildeSlice), + make_freeze_transform(i_xtilde), + make_slice_transform(WTilde, IWTildeSliceBegin, WTildeSlice), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, + Sequence<1>{}, + Sequence<2>{}, + Sequence<3>{}, + Sequence<4>{}, + Sequence<5>{}), + make_tuple(Sequence<0>{}, + Sequence<>{}, + Sequence<1>{}, + Sequence<>{}, + Sequence<2>{}, + Sequence<3>{})); + + const auto in_gemmmraw_gemmnraw_grid_desc = transform_tensor_descriptor( + in_n_htildeslice_wtildeslice_c_grid_desc, + make_tuple(make_merge_transform(make_tuple(N, HTildeSlice, WTildeSlice)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + const auto in_gemmm_gemmn_grid_desc = 
ck::tensor_operation::device::PadTensorDescriptor( + in_gemmmraw_gemmnraw_grid_desc, + make_tuple(GemmMPerBlock, GemmNPerBlock), + Sequence{}); + + return in_gemmm_gemmn_grid_desc; + } + } + + // for input bias + template || + is_same_v), + bool>::type = false> + static auto + MakeCDescriptor_M_N(const std::array& out_g_n_k_wos_lengths, + const std::array& /* out_g_n_k_wos_strides */, + const std::array& wei_g_k_c_xs_lengths, + const std::array& /* wei_g_k_c_xs_strides */, + const std::array& in_g_n_c_wis_lengths, + const std::array& /* in_g_n_c_wis_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& /* input_right_pads */, + const std::array& /* tildes */) + { + const index_t N = in_g_n_c_wis_lengths[1]; + const index_t C = wei_g_k_c_xs_lengths[2]; + + const index_t Hi = in_g_n_c_wis_lengths[3]; + const index_t Wi = in_g_n_c_wis_lengths[4]; + + const index_t Ho = out_g_n_k_wos_lengths[3]; + const index_t Wo = out_g_n_k_wos_lengths[4]; + + const index_t Y = wei_g_k_c_xs_lengths[3]; + const index_t X = wei_g_k_c_xs_lengths[4]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + if constexpr(ConvBwdDataSpecialization == + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization:: + Filter1x1Stride1Pad0) + { + const auto in_gemmm_gemmn_grid_desc = + make_naive_tensor_descriptor(make_tuple(N * Ho * Wo, C), make_tuple(I0, I1)); + + return in_gemmm_gemmn_grid_desc; + } + else + { + const auto GcdStrideDilationH = math::gcd(ConvStrideH, ConvDilationH); + const auto GcdStrideDilationW = math::gcd(ConvStrideW, ConvDilationW); + + const auto YTilde = ConvStrideH / GcdStrideDilationH; + const auto XTilde = ConvStrideW / GcdStrideDilationW; + + const auto HTilde = + Ho + math::integer_divide_ceil(ConvDilationH * (Y - I1), ConvStrideH); + const auto WTilde = + Wo + math::integer_divide_ceil(ConvDilationW * (X - I1), ConvStrideW); + + // only work on HTilde and WTilde that contribute to non-padding area of input tensor + const auto IHTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadH - ConvDilationH * (YTilde - I1)), ConvStrideH); + const auto IWTildeSliceBegin = math::integer_divide_floor( + math::max(I0, InLeftPadW - ConvDilationW * (XTilde - I1)), ConvStrideW); + + const auto IHTildeSliceEnd = math::min( + HTilde, math::integer_divide_ceil(InLeftPadH + Hi - I1, ConvStrideH) + I1); + const auto IWTildeSliceEnd = math::min( + WTilde, math::integer_divide_ceil(InLeftPadW + Wi - I1, ConvStrideW) + I1); + + const auto HTildeSlice = IHTildeSliceEnd - IHTildeSliceBegin; + const auto WTildeSlice = IWTildeSliceEnd - IWTildeSliceBegin; + + // bias tensor + const auto in_gemmmraw_gemmnraw_grid_desc = make_naive_tensor_descriptor( + make_tuple(N * HTildeSlice * WTildeSlice, C), make_tuple(I0, I1)); + + const auto in_gemmm_gemmn_grid_desc = ck::tensor_operation::device::PadTensorDescriptor( + in_gemmmraw_gemmnraw_grid_desc, + make_tuple(GemmMPerBlock, GemmNPerBlock), + Sequence{}); + + return in_gemmm_gemmn_grid_desc; + } + } +}; + +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp 
b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp new file mode 100644 index 00000000..1b5e64b6 --- /dev/null +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -0,0 +1,880 @@ + +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/utility/numeric.hpp" +#include "ck/utility/common_header.hpp" +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" + +namespace ck { +namespace tensor_operation { + +template +struct TransformConvFwdToGemm +{ + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + template , + bool>::type = false> + static auto + MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& /* a_g_n_c_wis_strides */, + const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */, + const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const index_t N = a_g_n_c_wis_lengths[1]; + const index_t C = a_g_n_c_wis_lengths[2]; + + const index_t Wi = a_g_n_c_wis_lengths[3]; + + const index_t Wo = c_g_n_k_wos_lengths[3]; + + const index_t ConvStrideW = conv_filter_strides[0]; + + if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + const index_t NWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto in_gemmm_gemmk_desc = + make_naive_tensor_descriptor_packed(make_tuple(NWo, C)); + + return in_gemmm_gemmk_desc; + } + else if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + const auto in_n_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); + + const auto in_n_wo_c_desc = transform_tensor_descriptor( + in_n_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( + in_n_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + else + { + const index_t X = b_g_k_c_xs_lengths[3]; + const index_t ConvDilationW = conv_filter_dilations[0]; + const index_t InLeftPadW = input_left_pads[0]; + const index_t InRightPadW = input_right_pads[0]; + + const auto in_n_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); + + const auto in_n_wip_c_desc = transform_tensor_descriptor( + in_n_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_n_x_wo_c_desc = transform_tensor_descriptor( + 
in_n_wip_c_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_gemmm_gemmk_desc = + transform_tensor_descriptor(in_n_x_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Wo)), + make_merge_transform(make_tuple(X, C))), + make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + } + + template , + bool>::type = false> + static auto + MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& /* a_g_n_c_wis_strides */, + const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */, + const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const index_t N = a_g_n_c_wis_lengths[1]; + const index_t C = a_g_n_c_wis_lengths[2]; + + const index_t Hi = a_g_n_c_wis_lengths[3]; + const index_t Wi = a_g_n_c_wis_lengths[4]; + + const index_t Ho = c_g_n_k_wos_lengths[3]; + const index_t Wo = c_g_n_k_wos_lengths[4]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + const index_t NHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto in_gemmm_gemmk_desc = + make_naive_tensor_descriptor_packed(make_tuple(NHoWo, C)); + + return in_gemmm_gemmk_desc; + } + else if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + const auto in_n_hi_wi_c_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_ho_wo_c_desc = transform_tensor_descriptor( + in_n_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmm_gemmk_desc = + transform_tensor_descriptor(in_n_ho_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + else + { + const index_t Y = b_g_k_c_xs_lengths[3]; + const index_t X = b_g_k_c_xs_lengths[4]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + const auto in_n_hi_wi_c_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); + + const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( + in_n_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + 
make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor( + in_n_hip_wip_c_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmm_gemmk_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)), + make_merge_transform(make_tuple(Y, X, C))), + make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + } + + template , + bool>::type = false> + static auto + MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& /* a_g_n_c_wis_strides */, + const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */, + const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const index_t N = a_g_n_c_wis_lengths[1]; + const index_t C = a_g_n_c_wis_lengths[2]; + + const index_t Di = a_g_n_c_wis_lengths[3]; + const index_t Hi = a_g_n_c_wis_lengths[4]; + const index_t Wi = a_g_n_c_wis_lengths[5]; + + const index_t Do = c_g_n_k_wos_lengths[3]; + const index_t Ho = c_g_n_k_wos_lengths[4]; + const index_t Wo = c_g_n_k_wos_lengths[5]; + + const index_t ConvStrideD = conv_filter_strides[0]; + const index_t ConvStrideH = conv_filter_strides[1]; + const index_t ConvStrideW = conv_filter_strides[2]; + + if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + const index_t NDoHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto in_gemmm_gemmk_desc = + make_naive_tensor_descriptor_packed(make_tuple(NDoHoWo, C)); + + return in_gemmm_gemmk_desc; + } + else if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + const auto in_n_di_hi_wi_c_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); + + const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( + in_n_do_ho_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + 
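+        // General filter case (no 1x1 specialization): build the implicit-GEMM (im2col) view of
+        // the packed NDHWC input in three steps -- pad Di/Hi/Wi, embed each padded spatial dim
+        // into a (filter tap, output index) pair, then merge to GemmM = N * Do * Ho * Wo and
+        // GemmK = Z * Y * X * C. The embed transform maps, e.g., (y, ho) to
+        // y * ConvDilationH + ho * ConvStrideH in the padded input coordinate.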
else + { + const index_t Z = b_g_k_c_xs_lengths[3]; + const index_t Y = b_g_k_c_xs_lengths[4]; + const index_t X = b_g_k_c_xs_lengths[5]; + + const index_t ConvDilationD = conv_filter_dilations[0]; + const index_t ConvDilationH = conv_filter_dilations[1]; + const index_t ConvDilationW = conv_filter_dilations[2]; + + const index_t InLeftPadD = input_left_pads[0]; + const index_t InLeftPadH = input_left_pads[1]; + const index_t InLeftPadW = input_left_pads[2]; + + const index_t InRightPadD = input_right_pads[0]; + const index_t InRightPadH = input_right_pads[1]; + const index_t InRightPadW = input_right_pads[2]; + + const auto in_n_di_hi_wi_c_desc = + make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); + + const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Di, InLeftPadD, InRightPadD), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor( + in_n_hip_wip_c_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( + in_n_z_do_y_ho_x_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), + make_merge_transform(make_tuple(Z, Y, X, C))), + make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + } + + // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as + // properties + template || + is_same_v), + bool>::type = false> + static auto + MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */, + const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const index_t N = a_g_n_c_wis_lengths[1]; + const index_t C = a_g_n_c_wis_lengths[2]; + + const index_t Wi = a_g_n_c_wis_lengths[3]; + + const index_t Wo = c_g_n_k_wos_lengths[3]; + + const index_t ConvStrideW = conv_filter_strides[0]; + + if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + const index_t NHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + // This is different + const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial]; + const auto CStride = I1; + + const auto in_gemmm_gemmk_desc = + make_naive_tensor_descriptor(make_tuple(NHoWo, C), make_tuple(WiStride, CStride)); + + return 
in_gemmm_gemmk_desc; + } + else if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // This is different + const index_t NStride = a_g_n_c_wis_strides[1]; + const index_t WiStride = a_g_n_c_wis_strides[3]; + const auto CStride = I1; + + const auto in_n_wi_c_desc = make_naive_tensor_descriptor( + make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride)); + + const auto in_n_wo_c_desc = transform_tensor_descriptor( + in_n_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( + in_n_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + else + { + const index_t X = b_g_k_c_xs_lengths[3]; + const index_t ConvDilationW = conv_filter_dilations[0]; + const index_t InLeftPadW = input_left_pads[0]; + const index_t InRightPadW = input_right_pads[0]; + + // This is different + const index_t NStride = a_g_n_c_wis_strides[1]; + const index_t WiStride = a_g_n_c_wis_strides[3]; + const auto CStride = I1; + + const auto in_n_wi_c_desc = make_naive_tensor_descriptor( + make_tuple(N, Wi, C), make_tuple(NStride, WiStride, CStride)); + + const auto in_n_wip_c_desc = transform_tensor_descriptor( + in_n_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); + + const auto in_n_x_wo_c_desc = transform_tensor_descriptor( + in_n_wip_c_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); + + const auto in_gemmm_gemmk_desc = + transform_tensor_descriptor(in_n_x_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Wo)), + make_merge_transform(make_tuple(X, C))), + make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + } + + template || + is_same_v), + bool>::type = false> + static auto + MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */, + const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const index_t N = a_g_n_c_wis_lengths[1]; + const index_t C = a_g_n_c_wis_lengths[2]; + + const index_t Hi = a_g_n_c_wis_lengths[3]; + const index_t Wi = a_g_n_c_wis_lengths[4]; + + const index_t Ho = c_g_n_k_wos_lengths[3]; + const index_t Wo = c_g_n_k_wos_lengths[4]; + + const index_t ConvStrideH = conv_filter_strides[0]; + const index_t ConvStrideW = conv_filter_strides[1]; + + if constexpr(ConvForwardSpecialization == + 
device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + const index_t NHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + // This is different + const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial]; + const auto CStride = I1; + + const auto in_gemmm_gemmk_desc = + make_naive_tensor_descriptor(make_tuple(NHoWo, C), make_tuple(WiStride, CStride)); + + return in_gemmm_gemmk_desc; + } + else if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // This is different + const index_t NStride = a_g_n_c_wis_strides[1]; + const index_t HiStride = a_g_n_c_wis_strides[3]; + const index_t WiStride = a_g_n_c_wis_strides[4]; + const auto CStride = I1; + + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor( + make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride)); + + const auto in_n_ho_wo_c_desc = transform_tensor_descriptor( + in_n_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_gemmm_gemmk_desc = + transform_tensor_descriptor(in_n_ho_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + else + { + const index_t Y = b_g_k_c_xs_lengths[3]; + const index_t X = b_g_k_c_xs_lengths[4]; + + const index_t ConvDilationH = conv_filter_dilations[0]; + const index_t ConvDilationW = conv_filter_dilations[1]; + + const index_t InLeftPadH = input_left_pads[0]; + const index_t InLeftPadW = input_left_pads[1]; + + const index_t InRightPadH = input_right_pads[0]; + const index_t InRightPadW = input_right_pads[1]; + + // This is different + const index_t NStride = a_g_n_c_wis_strides[1]; + const index_t HiStride = a_g_n_c_wis_strides[3]; + const index_t WiStride = a_g_n_c_wis_strides[4]; + const auto CStride = I1; + + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor( + make_tuple(N, Hi, Wi, C), make_tuple(NStride, HiStride, WiStride, CStride)); + + const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( + in_n_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor( + in_n_hip_wip_c_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); + + const auto in_gemmm_gemmk_desc = + transform_tensor_descriptor(in_n_y_ho_x_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)), + make_merge_transform(make_tuple(Y, X, C))), + 
make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + } + + template || + is_same_v), + bool>::type = false> + static auto + MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, + const std::array& a_g_n_c_wis_strides, + const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */, + const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + const index_t N = a_g_n_c_wis_lengths[1]; + const index_t C = a_g_n_c_wis_lengths[2]; + + const index_t Di = a_g_n_c_wis_lengths[3]; + const index_t Hi = a_g_n_c_wis_lengths[4]; + const index_t Wi = a_g_n_c_wis_lengths[5]; + + const index_t Do = c_g_n_k_wos_lengths[3]; + const index_t Ho = c_g_n_k_wos_lengths[4]; + const index_t Wo = c_g_n_k_wos_lengths[5]; + + const index_t ConvStrideD = conv_filter_strides[0]; + const index_t ConvStrideH = conv_filter_strides[1]; + const index_t ConvStrideW = conv_filter_strides[2]; + + if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) + { + const index_t NDoHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + // This is different + const index_t WiStride = a_g_n_c_wis_strides[2 + NDimSpatial]; + const auto CStride = I1; + + const auto in_gemmm_gemmk_desc = + make_naive_tensor_descriptor(make_tuple(NDoHoWo, C), make_tuple(WiStride, CStride)); + + return in_gemmm_gemmk_desc; + } + else if constexpr(ConvForwardSpecialization == + device::ConvolutionForwardSpecialization::Filter1x1Pad0) + { + // This is different + const index_t NStride = a_g_n_c_wis_strides[1]; + const index_t DiStride = a_g_n_c_wis_strides[3]; + const index_t HiStride = a_g_n_c_wis_strides[4]; + const index_t WiStride = a_g_n_c_wis_strides[5]; + const auto CStride = I1; + + const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor( + make_tuple(N, Di, Hi, Wi, C), + make_tuple(NStride, DiStride, HiStride, WiStride, CStride)); + + const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)), + make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), + make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( + in_n_do_ho_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), + make_pass_through_transform(C)), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + else + { + const index_t Z = b_g_k_c_xs_lengths[3]; + const index_t Y = b_g_k_c_xs_lengths[4]; + const index_t X = b_g_k_c_xs_lengths[5]; + + const index_t ConvDilationD = conv_filter_dilations[0]; + const index_t ConvDilationH = conv_filter_dilations[1]; + const index_t ConvDilationW = conv_filter_dilations[2]; + + const index_t InLeftPadD = input_left_pads[0]; + const index_t InLeftPadH = input_left_pads[1]; + const index_t InLeftPadW = input_left_pads[2]; + 
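+            // Same pad -> embed -> merge construction as the packed case above, except the naive
+            // input descriptor below is built from the explicit N/Di/Hi/Wi strides taken from
+            // a_g_n_c_wis_strides (with CStride fixed to 1), so inputs whose batch and spatial
+            // strides are not simply packed are still described correctly.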
+ const index_t InRightPadD = input_right_pads[0]; + const index_t InRightPadH = input_right_pads[1]; + const index_t InRightPadW = input_right_pads[2]; + + // This is different + const index_t NStride = a_g_n_c_wis_strides[1]; + const index_t DiStride = a_g_n_c_wis_strides[3]; + const index_t HiStride = a_g_n_c_wis_strides[4]; + const index_t WiStride = a_g_n_c_wis_strides[5]; + const auto CStride = I1; + + const auto in_n_di_hi_wi_c_desc = make_naive_tensor_descriptor( + make_tuple(N, Di, Hi, Wi, C), + make_tuple(NStride, DiStride, HiStride, WiStride, CStride)); + + const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( + in_n_di_hi_wi_c_desc, + make_tuple(make_pass_through_transform(N), + make_pad_transform(Di, InLeftPadD, InRightPadD), + make_pad_transform(Hi, InLeftPadH, InRightPadH), + make_pad_transform(Wi, InLeftPadW, InRightPadW), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); + + const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor( + in_n_hip_wip_c_desc, + make_tuple( + make_pass_through_transform(N), + make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)), + make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), + make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), + make_pass_through_transform(C)), + make_tuple( + Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, + Sequence<1, 2>{}, + Sequence<3, 4>{}, + Sequence<5, 6>{}, + Sequence<7>{})); + + const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( + in_n_z_do_y_ho_x_wo_c_desc, + make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), + make_merge_transform(make_tuple(Z, Y, X, C))), + make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return in_gemmm_gemmk_desc; + } + } + + template || + is_same_v || + is_same_v, + bool>::type = false> + static auto + MakeBDescriptor_N_K(const std::array& b_g_k_c_xs_lengths, + const std::array& /* b_g_k_c_xs_strides */) + { + const index_t K = b_g_k_c_xs_lengths[1]; + const index_t C = b_g_k_c_xs_lengths[2]; + + const index_t YX = ck::accumulate_n( + b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto wei_gemmn_gemmk_desc = + make_naive_tensor_descriptor_packed(make_tuple(K, YX * C)); + + return wei_gemmn_gemmk_desc; + } + + template < + typename BLayout, + typename std::enable_if || + is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v, + bool>::type = false> + static auto MakeBDescriptor_N_K(const std::array& b_g_k_c_xs_lengths, + const std::array& b_g_k_c_xs_strides) + { + const index_t K = b_g_k_c_xs_lengths[1]; + const index_t C = b_g_k_c_xs_lengths[2]; + + const index_t YX = ck::accumulate_n( + b_g_k_c_xs_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const index_t KStride = b_g_k_c_xs_strides[1]; + const index_t XStride = b_g_k_c_xs_strides[2 + NDimSpatial]; + const auto CStride = I1; + + const auto wei_k_yx_c_desc = make_naive_tensor_descriptor( + make_tuple(K, YX, C), make_tuple(KStride, XStride, CStride)); + + const auto wei_gemmn_gemmk_desc = transform_tensor_descriptor( + wei_k_yx_c_desc, + make_tuple(make_pass_through_transform(K), make_merge_transform(make_tuple(YX, C))), + make_tuple(Sequence<0>{}, Sequence<1, 2>{}), + 
make_tuple(Sequence<0>{}, Sequence<1>{})); + + return wei_gemmn_gemmk_desc; + } + + template || + is_same_v || + is_same_v, + bool>::type = false> + static auto + MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */) + { + const index_t N = c_g_n_k_wos_lengths[1]; + const index_t K = c_g_n_k_wos_lengths[2]; + + const index_t NHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto out_gemmm_gemmn_desc = make_naive_tensor_descriptor_packed(make_tuple(NHoWo, K)); + + return out_gemmm_gemmn_desc; + } + + template < + typename CLayout, + typename std::enable_if || + is_same_v || + is_same_v || + is_same_v || + is_same_v || + is_same_v, + bool>::type = false> + static auto MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, + const std::array& c_g_n_k_wos_strides) + { + const index_t N = c_g_n_k_wos_lengths[1]; + const index_t K = c_g_n_k_wos_lengths[2]; + + const auto KStride = I1; + const index_t WoStride = c_g_n_k_wos_strides[NDimSpatial + 2]; + + const index_t NHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto out_gemmm_gemmn_desc = + make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(WoStride, KStride)); + + return out_gemmm_gemmn_desc; + } + + // for output bias + template || + is_same_v, + bool>::type = false> + static auto + MakeCDescriptor_M_N(const std::array& c_g_n_k_wos_lengths, + const std::array& /* c_g_n_k_wos_strides */) + { + const index_t N = c_g_n_k_wos_lengths[1]; + const index_t K = c_g_n_k_wos_lengths[2]; + + const index_t NHoWo = + N * ck::accumulate_n( + c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); + + const auto out_gemmm_gemmn_desc = + make_naive_tensor_descriptor(make_tuple(NHoWo, K), make_tuple(I0, I1)); + + return out_gemmm_gemmn_desc; + } +}; + +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/utility/amd_address_space.hpp b/include/ck/utility/amd_address_space.hpp new file mode 100644 index 00000000..9f152591 --- /dev/null +++ b/include/ck/utility/amd_address_space.hpp @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
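+// This header provides the AMDGCN address-space tags (AddressSpaceEnum) together with helpers
+// that cast pointers between the Constant (4) and Generic (0) address spaces; only a C-style
+// cast compiles for these conversions, hence the clang diagnostic pragmas around them.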
+ +#pragma once + +#include "ck/ck.hpp" +#include "c_style_pointer_cast.hpp" + +// Address Space for AMDGCN +// https://llvm.org/docs/AMDGPUUsage.html#address-space + +namespace ck { + +enum struct AddressSpaceEnum +{ + Generic, + Global, + Lds, + Sgpr, + Vgpr, +}; + +template +__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p) +{ + // cast a pointer in "Constant" address space (4) to "Generic" address space (0) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +template +__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p) +{ + // cast a pointer in "Generic" address space (0) to "Constant" address space (4) + // only c-style pointer cast seems be able to be compiled +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" + return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast) +#pragma clang diagnostic pop +} + +} // namespace ck diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp new file mode 100644 index 00000000..79295356 --- /dev/null +++ b/include/ck/utility/amd_buffer_addressing.hpp @@ -0,0 +1,1177 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include "data_type.hpp" + +namespace ck { + +template +union BufferResource +{ + __device__ constexpr BufferResource() : content{} {} + + // 128 bit SGPRs to supply buffer resource in buffer instructions + // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions + int32x4_t content; + StaticallyIndexedArray address; + StaticallyIndexedArray range; + StaticallyIndexedArray config; +}; + +template +__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size) +{ + BufferResource wave_buffer_resource; + + // wavewise base address (64 bit) + wave_buffer_resource.address(Number<0>{}) = const_cast*>(p_wave); + // wavewise range (32 bit) + wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T); + // wavewise setting (32 bit) + wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD; + + return wave_buffer_resource.content; +} + +template +__device__ int32x4_t make_wave_buffer_resource_with_default_range(T* p_wave) +{ + BufferResource wave_buffer_resource; + + // wavewise base address (64 bit) + wave_buffer_resource.address(Number<0>{}) = const_cast*>(p_wave); + // wavewise range (32 bit) + wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range + // wavewise setting (32 bit) + wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD; + + return wave_buffer_resource.content; +} + +// buffer load i8 +__device__ int8_t +llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8"); + +__device__ int8x2_t +llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8"); + +__device__ int8x4_t +llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8"); + +// buffer load i16 +__device__ bhalf_t 
+llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16"); + +__device__ bhalf2_t +llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16"); + +__device__ bhalf4_t +llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16"); + +// buffer load i32 +__device__ int32_t +llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32"); + +__device__ int32x2_t +llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32"); + +__device__ int32x4_t +llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32"); + +// buffer load fp16 +__device__ half_t +llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16"); + +__device__ half2_t +llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16"); + +__device__ half4_t +llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16"); + +// buffer load fp32 +__device__ float +llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32"); + +__device__ float2_t +llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32"); + +__device__ float4_t +llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32"); + +// buffer store i8 +__device__ void +llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8"); + +// buffer store i16 +__device__ void +llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16"); + +// buffer store i32 +__device__ void +llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32"); + +__device__ void 
+llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32"); + +// buffer store fp16 +__device__ void +llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16"); + +// buffer store fp32 +__device__ void +llvm_amdgcn_raw_buffer_store_fp32(float vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32"); + +__device__ void +llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32"); + +// buffer atomic-add fp16 +__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( + half2_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"); + +// buffer atomic-add i32 +__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32( + int32_t vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32"); + +// buffer atomic-add fp32 +__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32( + float vdata, + int32x4_t rsrc, + index_t voffset, + index_t soffset, + index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32"); + +// buffer atomic-add fp32 +__device__ double +llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, + int32x4_t rsrc, // dst_wave_buffer_resource + int voffset, // dst_thread_addr_offset + int soffset, // dst_wave_addr_offset + int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64"); + +template +__device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, + index_t src_thread_addr_offset, + index_t src_wave_addr_offset) +{ + static_assert( + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), + "wrong! 
not implemented"); + + if constexpr(is_same::value) + { + // use fp32 load to mimic fp64 load + if constexpr(N == 1) + { + const float2_t tmp = llvm_amdgcn_raw_buffer_load_fp32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); + } + else if constexpr(N == 2) + { + const float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); + } + else if constexpr(N == 4) + { + const float4_t f32_0 = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + const float4_t f32_1 = + llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(float), + 0); + vector_type tmp; + + tmp.AsType()(Number<0>{}) = bit_cast(f32_0); + tmp.AsType()(Number<1>{}) = bit_cast(f32_1); + + return tmp.AsType()(Number<0>{}); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_fp32( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_fp32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(float), + 0); + + return tmp.AsType()(Number<0>{}); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_fp16( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_fp16x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_fp16x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { + // use fp32 load to mimic fp16 load + float4_t tmp = llvm_amdgcn_raw_buffer_load_fp32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_i16( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_i16x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_i16x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { + int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_i32( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + 
else if constexpr(N == 2) + { + return llvm_amdgcn_raw_buffer_load_i32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 4) + { + return llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 8) + { + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int32_t), + 0); + return tmp.AsType()(Number<0>{}); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + return llvm_amdgcn_raw_buffer_load_i8( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + } + else if constexpr(N == 2) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + return llvm_amdgcn_raw_buffer_load_i8x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); +#else + int16_t tmp = llvm_amdgcn_raw_buffer_load_i16( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); +#endif + } + else if constexpr(N == 4) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + return llvm_amdgcn_raw_buffer_load_i8x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); +#else + int32_t tmp = llvm_amdgcn_raw_buffer_load_i32( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); +#endif + } + else if constexpr(N == 8) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int8_t), + 0); + + return tmp.AsType()(Number<0>{}); +#else + int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); +#endif + } + else if constexpr(N == 16) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + vector_type tmp; + + tmp.AsType()(Number<0>{}) = llvm_amdgcn_raw_buffer_load_i8x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + tmp.AsType()(Number<1>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 4 * sizeof(int8_t), + 0); + + tmp.AsType()(Number<2>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 8 * sizeof(int8_t), + 0); + + tmp.AsType()(Number<3>{}) = + llvm_amdgcn_raw_buffer_load_i8x4(src_wave_buffer_resource, + src_thread_addr_offset, + src_wave_addr_offset + 12 * sizeof(int8_t), + 0); + + return tmp.AsType()(Number<0>{}); +#else + int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4( + src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset, 0); + + return bit_cast(tmp); +#endif + } + } +} + +template +__device__ void amd_buffer_store_impl(const typename vector_type::type src_thread_data, + int32x4_t dst_wave_buffer_resource, + index_t dst_thread_addr_offset, + index_t dst_wave_addr_offset) +{ + static_assert( + 
(is_same::value && (N == 1 || N == 2)) || + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)), + "wrong! not implemented"); + + if constexpr(is_same::value) + { + // use fp32 store to mimic fp64 store + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp32x2(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp32(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp32x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_fp32x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_fp16(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_fp16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_fp16x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 8) + { +#if 0 + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_store_fp16x4(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 4 * sizeof(half_t), + 0); +#else + llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#endif + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_i16(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_i16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_i16x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 8) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_store_i16x4(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_store_i16x4(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 4 * sizeof(bhalf_t), + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + 
llvm_amdgcn_raw_buffer_store_i32(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_store_i32x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + llvm_amdgcn_raw_buffer_store_i32x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_store_i8(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + llvm_amdgcn_raw_buffer_store_i8x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#else + llvm_amdgcn_raw_buffer_store_i16(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#endif + } + else if constexpr(N == 4) + { +#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE + llvm_amdgcn_raw_buffer_store_i8x4(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#else + llvm_amdgcn_raw_buffer_store_i32(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); +#endif + } + else if constexpr(N == 8) + { + llvm_amdgcn_raw_buffer_store_i32x2(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 16) + { + llvm_amdgcn_raw_buffer_store_i32x4(bit_cast(src_thread_data), + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + } +} + +template +__device__ void amd_buffer_atomic_add_impl(const typename vector_type::type src_thread_data, + int32x4_t dst_wave_buffer_resource, + index_t dst_thread_addr_offset, + index_t dst_wave_addr_offset) +{ + static_assert((is_same::value && (N == 1 || N == 2 || N == 4)) || + (is_same::value && (N == 2 || N == 4 || N == 8)) || + (is_same::value && (N == 1 || N == 2 || N == 4)), + "wrong! 
not implemented"); + + if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_atomic_add_fp32(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + sizeof(float), + 0); + } + else if constexpr(N == 4) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + sizeof(float), + 0); + + llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<2>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 2 * sizeof(float), + 0); + + llvm_amdgcn_raw_buffer_atomic_add_fp32(tmp.AsType()[Number<3>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 3 * sizeof(float), + 0); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 2) + { + llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 4) + { + vector_type tmp{src_thread_data}; + + static_for<0, 2, 1>{}([&](auto i) { + llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType()[i], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + i * sizeof(half2_t), + 0); + }); + } + else if constexpr(N == 8) + { + vector_type tmp{src_thread_data}; + + static_for<0, 4, 1>{}([&](auto i) { + llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType()[i], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + i * sizeof(half2_t), + 0); + }); + } + } + else if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_atomic_add_i32(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + sizeof(int32_t), + 0); + } + else if constexpr(N == 4) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + sizeof(int32_t), + 0); + + llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<2>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 2 * sizeof(int32_t), + 0); + + llvm_amdgcn_raw_buffer_atomic_add_i32(tmp.AsType()[Number<3>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 3 * sizeof(int32_t), + 0); + } + } +} + +template +__device__ void 
amd_buffer_atomic_max_impl(const typename vector_type::type src_thread_data, + int32x4_t dst_wave_buffer_resource, + index_t dst_thread_addr_offset, + index_t dst_wave_addr_offset) +{ + static_assert((is_same::value && (N == 1 || N == 2 || N == 4)), + "wrong! not implemented"); + if constexpr(is_same::value) + { + if constexpr(N == 1) + { + llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data, + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + } + else if constexpr(N == 2) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + sizeof(double), + 0); + } + else if constexpr(N == 4) + { + vector_type tmp{src_thread_data}; + + llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType()[Number<0>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset, + 0); + + llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType()[Number<1>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + sizeof(double), + 0); + + llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType()[Number<2>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 2 * sizeof(double), + 0); + + llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType()[Number<3>{}], + dst_wave_buffer_resource, + dst_thread_addr_offset, + dst_wave_addr_offset + 3 * sizeof(double), + 0); + } + } +} + +// buffer_load requires: +// 1) p_src_wave must point to global memory space +// 2) p_src_wave must be a wavewise pointer. +// It is user's responsibility to make sure that is true. +template +__device__ typename vector_type_maker::type::type +amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, + index_t src_thread_element_offset, + bool src_thread_element_valid, + index_t src_element_space_size) +{ + const int32x4_t src_wave_buffer_resource = + make_wave_buffer_resource(p_src_wave, src_element_space_size); + + index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + + constexpr index_t vector_size = scalar_type::vector_size; + +#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK + uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x7fffffff; + + return amd_buffer_load_impl( + src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0); +#else + vector_t tmp = amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0); + + return src_thread_element_valid ? tmp : vector_t(0); +#endif +} + +// buffer_load requires: +// 1) p_src_wave must point to global memory space +// 2) p_src_wave must be a wavewise pointer. +// It is user's responsibility to make sure that is true. 
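+//
+// Illustrative usage sketch for the return-zero variant above (hypothetical variable
+// names; assumes the wrapper is parameterized on the element type T and vector width N
+// as in the rest of this header):
+//
+//   auto v = amd_buffer_load_invalid_element_return_zero<float, 4>(
+//       p_src_wave,              // wavewise pointer into global memory
+//       thread_element_offset,   // per-thread element offset
+//       thread_element_valid,    // false => the lane reads back as 0
+//       element_space_size);     // buffer extent used by the hardware range check
+//
+// The variant below behaves the same way, but invalid elements return a
+// caller-supplied fill value instead of zero.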
+template +__device__ typename vector_type_maker::type::type +amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, + index_t src_thread_element_offset, + bool src_thread_element_valid, + index_t src_element_space_size, + T customized_value) +{ + const int32x4_t src_wave_buffer_resource = + make_wave_buffer_resource(p_src_wave, src_element_space_size); + + index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + + constexpr index_t vector_size = scalar_type::vector_size; + + vector_t tmp = amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0); + + return src_thread_element_valid ? tmp : vector_t(customized_value); +} + +// buffer_store requires: +// 1) p_dst_wave must point to global memory +// 2) p_dst_wave must be a wavewise pointer. +// It is user's responsibility to make sure that is true. +template +__device__ void amd_buffer_store(const typename vector_type_maker::type::type src_thread_data, + T* p_dst_wave, + const index_t dst_thread_element_offset, + const bool dst_thread_element_valid, + const index_t dst_element_space_size) +{ + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size); + + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + constexpr index_t vector_size = scalar_type::vector_size; + +#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK + uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff; + + amd_buffer_store_impl( + src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); +#else + if(dst_thread_element_valid) + { + amd_buffer_store_impl( + src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); + } +#endif +} + +// buffer_atomic_add requires: +// 1) p_dst_wave must point to global memory +// 2) p_dst_wave must be a wavewise pointer. +// It is user's responsibility to make sure that is true. +template +__device__ void +amd_buffer_atomic_add(const typename vector_type_maker::type::type src_thread_data, + T* p_dst_wave, + const index_t dst_thread_element_offset, + const bool dst_thread_element_valid, + const index_t dst_element_space_size) +{ + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size); + + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + constexpr index_t vector_size = scalar_type::vector_size; + +#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK + uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff; + + amd_buffer_atomic_add_impl( + src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); +#else + if(dst_thread_element_valid) + { + amd_buffer_atomic_add_impl( + src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); + } +#endif +} + +// buffer_atomic_max requires: +// 1) p_dst_wave must point to global memory +// 2) p_dst_wave must be a wavewise pointer. +// It is user's responsibility to make sure that is true. 
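+//
+// Note on out-of-bounds handling shared by these wrappers: when the corresponding
+// CK_EXPERIMENTAL_USE_BUFFER_*_OOB_CHECK_OFFSET_TRICK macro is enabled, an invalid
+// element does not take a branch; the thread address offset is shifted by 0x7fffffff
+// so the hardware buffer range check discards the access. Otherwise the operation is
+// simply guarded by if(dst_thread_element_valid). A hypothetical sketch of the same
+// idea (names are illustrative only):
+//
+//   uint32_t shift = valid ? 0 : 0x7fffffff;  // push the address past the buffer extent
+//   do_buffer_op(resource, shift + byte_offset, /*wave offset*/ 0);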
+template +__device__ void +amd_buffer_atomic_max(const typename vector_type_maker::type::type src_thread_data, + T* p_dst_wave, + const index_t dst_thread_element_offset, + const bool dst_thread_element_valid, + const index_t dst_element_space_size) +{ + const int32x4_t dst_wave_buffer_resource = + make_wave_buffer_resource(p_dst_wave, dst_element_space_size); + + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + constexpr index_t vector_size = scalar_type::vector_size; + +#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK + uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff; + + amd_buffer_atomic_max_impl( + src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); +#else + if(dst_thread_element_valid) + { + amd_buffer_atomic_max_impl( + src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); + } +#endif +} + +} // namespace ck diff --git a/include/ck/utility/amd_inline_asm.hpp b/include/ck/utility/amd_inline_asm.hpp new file mode 100644 index 00000000..82bf2a5e --- /dev/null +++ b/include/ck/utility/amd_inline_asm.hpp @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_AMD_INLINE_ASM_HPP +#define CK_AMD_INLINE_ASM_HPP + +#include "data_type.hpp" +#include "c_style_pointer_cast.hpp" + +// TODO: deprecate all amd_assembly_outer_product_xxx + +namespace ck { + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void amd_assembly_outer_product_1x2(float a, float b0, float b1, float& c0, float& c1) +{ + asm volatile("\n \ + v_fmac_f32 %0, %2, %3 \n \ + v_fmac_f32 %1, %2, %4 \n \ + " + : "=v"(c0), "=v"(c1) + : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4( + float a, float b0, float b1, float b2, float b3, float& c0, float& c1, float& c2, float& c3) +{ + asm volatile("\n \ + v_fmac_f32 %0, %4, %5 \n \ + v_fmac_f32 %1, %4, %6 \n \ + v_fmac_f32 %2, %4, %7 \n \ + v_fmac_f32 %3, %4, %8 \n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void +amd_assembly_outer_product_1x2(half2_t a, half2_t b0, half2_t b1, float& c0, float& c1) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %3, %0\n \ + v_dot2_f32_f16 %1, %2, %4, %1\n \ + " + : "=v"(c0), "=v"(c1) + : "v"(a), "v"(b0), "v"(b1), "0"(c0), "1"(c1)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void +amd_assembly_outer_product_1x2(half4_t a, half4_t b0, half4_t b1, float& c0, float& c1) +{ + // TODO remove pointer casting + const half2_t* p_a_half2 = c_style_pointer_cast(&a); + const half2_t* p_b0_half2 = c_style_pointer_cast(&b0); + const half2_t* p_b1_half2 = c_style_pointer_cast(&b1); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %2, %4, %0\n \ + v_dot2_f32_f16 %1, %2, %6, %1\n \ + v_dot2_f32_f16 %0, %3, %5, %0\n \ + v_dot2_f32_f16 %1, %3, %7, %1\n \ + " + : "=v"(c0), "=v"(c1) + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), + "0"(c0), + "1"(c1)); +} + +// c0 += 
inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4(half2_t a, + half2_t b0, + half2_t b1, + half2_t b2, + half2_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %5, %0\n \ + v_dot2_f32_f16 %1, %4, %6, %1\n \ + v_dot2_f32_f16 %2, %4, %7, %2\n \ + v_dot2_f32_f16 %3, %4, %8, %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(a), "v"(b0), "v"(b1), "v"(b2), "v"(b3), "0"(c0), "1"(c1), "2"(c2), "3"(c3)); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4(half4_t a, + half4_t b0, + half4_t b1, + half4_t b2, + half4_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + // TODO remove pointer casting + const half2_t* p_a_half2 = c_style_pointer_cast(&a); + const half2_t* p_b0_half2 = c_style_pointer_cast(&b0); + const half2_t* p_b1_half2 = c_style_pointer_cast(&b1); + const half2_t* p_b2_half2 = c_style_pointer_cast(&b2); + const half2_t* p_b3_half2 = c_style_pointer_cast(&b3); + + // do dot2 two times + asm volatile("\n \ + v_dot2_f32_f16 %0, %4, %6, %0\n \ + v_dot2_f32_f16 %1, %4, %8, %1\n \ + v_dot2_f32_f16 %2, %4, %10, %2\n \ + v_dot2_f32_f16 %3, %4, %12, %3\n \ + v_dot2_f32_f16 %0, %5, %7, %0\n \ + v_dot2_f32_f16 %1, %5, %9, %1\n \ + v_dot2_f32_f16 %2, %5, %11, %2\n \ + v_dot2_f32_f16 %3, %5, %13, %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(p_a_half2[0]), + "v"(p_a_half2[1]), + "v"(p_b0_half2[0]), + "v"(p_b0_half2[1]), + "v"(p_b1_half2[0]), + "v"(p_b1_half2[1]), + "v"(p_b2_half2[0]), + "v"(p_b2_half2[1]), + "v"(p_b3_half2[0]), + "v"(p_b3_half2[1]), + "0"(c0), + "1"(c1), + "2"(c2), + "3"(c3)); +} + +__device__ void amd_assembly_outer_product_1x4(half8_t a, + half8_t b0, + half8_t b1, + half8_t b2, + half8_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + + // TODO remove pointer casting + const half4_t* p_a_half4 = c_style_pointer_cast(&a); + const half4_t* p_b0_half4 = c_style_pointer_cast(&b0); + const half4_t* p_b1_half4 = c_style_pointer_cast(&b1); + const half4_t* p_b2_half4 = c_style_pointer_cast(&b2); + const half4_t* p_b3_half4 = c_style_pointer_cast(&b3); + + amd_assembly_outer_product_1x4( + p_a_half4[0], p_b0_half4[0], p_b1_half4[0], p_b2_half4[0], p_b3_half4[0], c0, c1, c2, c3); + + amd_assembly_outer_product_1x4( + p_a_half4[1], p_b0_half4[1], p_b1_half4[1], p_b2_half4[1], p_b3_half4[1], c0, c1, c2, c3); +} + +__device__ void amd_assembly_outer_product_1x4(half16_t a, + half16_t b0, + half16_t b1, + half16_t b2, + half16_t b3, + float& c0, + float& c1, + float& c2, + float& c3) +{ + // TODO remove pointer casting + const half8_t* p_a_half8 = c_style_pointer_cast(&a); + const half8_t* p_b0_half8 = c_style_pointer_cast(&b0); + const half8_t* p_b1_half8 = c_style_pointer_cast(&b1); + const half8_t* p_b2_half8 = c_style_pointer_cast(&b2); + const half8_t* p_b3_half8 = c_style_pointer_cast(&b3); + + amd_assembly_outer_product_1x4( + p_a_half8[0], p_b0_half8[0], p_b1_half8[0], p_b2_half8[0], p_b3_half8[0], c0, c1, c2, c3); + + amd_assembly_outer_product_1x4( + p_a_half8[1], p_b0_half8[1], p_b1_half8[1], p_b2_half8[1], p_b3_half8[1], c0, c1, c2, c3); +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +__device__ void +amd_assembly_outer_product_1x2(int8x4_t a, int8x4_t b0, int8x4_t b1, int32_t& c0, int32_t& c1) +{ +#if 1 + asm 
volatile("\n \ + v_dot4_i32_i8 %0, %2, %3, %0\n \ + v_dot4_i32_i8 %1, %2, %4, %1\n \ + " + : "=v"(c0), "=v"(c1) + : "v"(bit_cast(a)), + "v"(bit_cast(b0)), + "v"(bit_cast(b1)), + "0"(c0), + "1"(c1)); +#else + c0 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b0), c0, false); + c1 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b1), c1, false); +#endif +} + +// c0 += inner_product(a, b0) +// c1 += inner_product(a, b1) +// c2 += inner_product(a, b2) +// c3 += inner_product(a, b3) +__device__ void amd_assembly_outer_product_1x4(int8x4_t a, + int8x4_t b0, + int8x4_t b1, + int8x4_t b2, + int8x4_t b3, + int32_t& c0, + int32_t& c1, + int32_t& c2, + int32_t& c3) +{ +#if 1 + asm volatile("\n \ + v_dot4_i32_i8 %0, %4, %5, %0\n \ + v_dot4_i32_i8 %1, %4, %6, %1\n \ + v_dot4_i32_i8 %2, %4, %7, %2\n \ + v_dot4_i32_i8 %3, %4, %8, %3\n \ + " + : "=v"(c0), "=v"(c1), "=v"(c2), "=v"(c3) + : "v"(bit_cast(a)), + "v"(bit_cast(b0)), + "v"(bit_cast(b1)), + "v"(bit_cast(b2)), + "v"(bit_cast(b3)), + "0"(c0), + "1"(c1), + "2"(c2), + "3"(c3)); +#else + c0 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b0), c0, false); + c1 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b1), c1, false); + c2 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b2), c2, false); + c3 = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b3), c3, false); +#endif +} + +__device__ void amd_assembly_outer_product_1x4(int8x8_t a, + int8x8_t b0, + int8x8_t b1, + int8x8_t b2, + int8x8_t b3, + int32_t& c0, + int32_t& c1, + int32_t& c2, + int32_t& c3) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I0], + vector_type{b0}.AsType()[I0], + vector_type{b1}.AsType()[I0], + vector_type{b2}.AsType()[I0], + vector_type{b3}.AsType()[I0], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I1], + vector_type{b0}.AsType()[I1], + vector_type{b1}.AsType()[I1], + vector_type{b2}.AsType()[I1], + vector_type{b3}.AsType()[I1], + c0, + c1, + c2, + c3); +} + +__device__ void amd_assembly_outer_product_1x4(int8x16_t a, + int8x16_t b0, + int8x16_t b1, + int8x16_t b2, + int8x16_t b3, + int32_t& c0, + int32_t& c1, + int32_t& c2, + int32_t& c3) + +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I0], + vector_type{b0}.AsType()[I0], + vector_type{b1}.AsType()[I0], + vector_type{b2}.AsType()[I0], + vector_type{b3}.AsType()[I0], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I1], + vector_type{b0}.AsType()[I1], + vector_type{b1}.AsType()[I1], + vector_type{b2}.AsType()[I1], + vector_type{b3}.AsType()[I1], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I2], + vector_type{b0}.AsType()[I2], + vector_type{b1}.AsType()[I2], + vector_type{b2}.AsType()[I2], + vector_type{b3}.AsType()[I2], + c0, + c1, + c2, + c3); + + amd_assembly_outer_product_1x4(vector_type{a}.AsType()[I3], + vector_type{b0}.AsType()[I3], + vector_type{b1}.AsType()[I3], + vector_type{b2}.AsType()[I3], + vector_type{b3}.AsType()[I3], + c0, + c1, + c2, + c3); +} + +} // namespace ck +#endif diff --git a/include/ck/utility/amd_llvm_intrinsic.hpp b/include/ck/utility/amd_llvm_intrinsic.hpp new file mode 100644 index 00000000..01e77d7b --- /dev/null +++ b/include/ck/utility/amd_llvm_intrinsic.hpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro 
Devices, Inc. All rights reserved. + +#ifndef CK_AMD_LLVM_INTRINSIC_HPP +#define CK_AMD_LLVM_INTRINSIC_HPP + +#include "data_type.hpp" + +namespace ck { + +__device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane"); + +} // namespace ck +#endif diff --git a/include/ck/utility/amd_wmma.hpp b/include/ck/utility/amd_wmma.hpp new file mode 100644 index 00000000..752876a7 --- /dev/null +++ b/include/ck/utility/amd_wmma.hpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_AMD_WMMA_HPP +#define CK_AMD_WMMA_HPP + +#include "data_type.hpp" +// TODO: Add arch limitation +namespace ck { + +// wave32 only +// src: fp16, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_f16_w32; + +template <> +struct intrin_wmma_f32_16x16x16_f16_w32<16, 16> +{ + template + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + +// src: bf16, dst: fp32 +template +struct intrin_wmma_f32_16x16x16_bf16_w32; + +template <> +struct intrin_wmma_f32_16x16x16_bf16_w32<16, 16> +{ + template + __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}]); + } +}; + +// src: fp16, dst: fp16 +template +struct intrin_wmma_f16_16x16x16_f16_w32; + +template +struct intrin_wmma_f16_16x16x16_f16_w32<16, 16, Opsel> +{ + template + __device__ static void Run(const half16_t& reg_a, const half16_t& reg_b, FloatC& reg_c) + { + // opsel usage + // false: D0.[0:15] = result + // true : D0.[16:31]= result + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); + } +}; + +// src: bf16, dst: bf16 +template +struct intrin_wmma_bf16_16x16x16_bf16_w32; + +template +struct intrin_wmma_bf16_16x16x16_bf16_w32<16, 16, Opsel> +{ + template + __device__ static void Run(const bhalf16_t& reg_a, const bhalf16_t& reg_b, FloatC& reg_c) + { + // opsel usage + // false: D0.[0:15] = result + // true : D0.[16:31]= result + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], Opsel); + } +}; + +// src: iu8, dst: i32 +template +struct intrin_wmma_i32_16x16x16_iu8_w32; + +template +struct intrin_wmma_i32_16x16x16_iu8_w32<16, 16, neg_a, neg_b, clamp> +{ + template + __device__ static void Run(const int8x16_t& reg_a, const int8x16_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32( + neg_a, + bit_cast(reg_a), + neg_b, + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + clamp); + } +}; + +} // namespace ck +#endif diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp new file mode 100644 index 00000000..b4be0cbe --- /dev/null +++ b/include/ck/utility/amd_xdlops.hpp @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
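+//
+// This header wraps the AMD MFMA (XDLOPS) compiler builtins such as
+// __builtin_amdgcn_mfma_f32_32x32x1f32 in small intrin_mfma_* structs, one
+// specialization per per-wave output tile. Illustrative call sketch (the accumulator
+// type here is hypothetical; real kernels pass a vector_type-backed register tile):
+//
+//   vector_type<float, 16> reg_c;                         // accumulator registers
+//   intrin_mfma_f32_32x32x2f32<32, 32>::Run(a, b, reg_c); // C += A * B for one k-slice
+//
+// Each Run() feeds the previous accumulator value back into the builtin, so repeated
+// calls accumulate a GEMM tile in registers.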
+ +#ifndef CK_AMD_XDLOPS_HPP +#define CK_AMD_XDLOPS_HPP + +#include "data_type.hpp" + +namespace ck { + +// fp32 +template +struct intrin_mfma_f32_32x32x1f32; + +template <> +struct intrin_mfma_f32_32x32x1f32<64, 64> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); + reg_c.template AsType()(Number<1>{}) = __builtin_amdgcn_mfma_f32_32x32x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 1, 1, 0); + } +}; + +template <> +struct intrin_mfma_f32_32x32x1f32<32, 64> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); + } +}; + +template +struct intrin_mfma_f32_32x32x2f32; + +template <> +struct intrin_mfma_f32_32x32x2f32<32, 32> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x2f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_16x16x4f32; + +template <> +struct intrin_mfma_f32_16x16x4f32<16, 16> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x4f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_16x16x1f32; + +template <> +struct intrin_mfma_f32_16x16x1f32<16, 64> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 2, 0, 0); + } +}; + +template +struct intrin_mfma_f32_4x4x1f32; + +template <> +struct intrin_mfma_f32_4x4x1f32<4, 64> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_4x4x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); + } +}; + +template <> +struct intrin_mfma_f32_4x4x1f32<8, 64> +{ + template + __device__ static void Run(const float& reg_a, const float& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_4x4x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); + reg_c.template AsType()(Number<1>{}) = __builtin_amdgcn_mfma_f32_4x4x1f32( + reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 4, 1, 0); + } +}; + +// fp16 +template +struct intrin_mfma_f32_32x32x4f16; + +template <> +struct intrin_mfma_f32_32x32x4f16<64, 64> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); + reg_c.template AsType()(Number<1>{}) = __builtin_amdgcn_mfma_f32_32x32x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 1, 1, 0); + } +}; + +template <> +struct intrin_mfma_f32_32x32x4f16<32, 64> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = 
__builtin_amdgcn_mfma_f32_32x32x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 1, 0, 0); + } +}; + +template +struct intrin_mfma_f32_32x32x8f16; + +template <> +struct intrin_mfma_f32_32x32x8f16<32, 32> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x8f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_16x16x16f16; + +template <> +struct intrin_mfma_f32_16x16x16f16<16, 16> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_16x16x4f16; + +template <> +struct intrin_mfma_f32_16x16x4f16<16, 64> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 2, 0, 0); + } +}; + +template +struct intrin_mfma_f32_4x4x4f16; + +template <> +struct intrin_mfma_f32_4x4x4f16<4, 64> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_4x4x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); + } +}; + +template <> +struct intrin_mfma_f32_4x4x4f16<8, 64> +{ + template + __device__ static void Run(const half4_t& reg_a, const half4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_4x4x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 4, 0, 0); + reg_c.template AsType()(Number<1>{}) = __builtin_amdgcn_mfma_f32_4x4x4f16( + reg_a, reg_b, reg_c.template AsType()[Number<1>{}], 4, 1, 0); + } +}; + +// bfp16 +template +struct intrin_mfma_f32_32x32x8bf16_1k; + +template <> +struct intrin_mfma_f32_32x32x8bf16_1k<32, 32> +{ + template + __device__ static void Run(const bhalf4_t& reg_a, const bhalf4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_16x16x16bf16_1k; + +template <> +struct intrin_mfma_f32_16x16x16bf16_1k<16, 16> +{ + template + __device__ static void Run(const bhalf4_t& reg_a, const bhalf4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x16bf16_1k( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_32x32x4bf16; + +template <> +struct intrin_mfma_f32_32x32x4bf16<32, 32> +{ + template + __device__ static void Run(const bhalf2_t& reg_a, const bhalf2_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_32x32x4bf16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct intrin_mfma_f32_16x16x8bf16; + +template <> +struct intrin_mfma_f32_16x16x8bf16<16, 16> +{ + template + __device__ static void Run(const bhalf2_t& reg_a, const bhalf2_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f32_16x16x8bf16( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); + } +}; + +template +struct 
intrin_mfma_i32_32x32x8i8; + +template <> +struct intrin_mfma_i32_32x32x8i8<32, 32> +{ + template + __device__ static void Run(const int8x4_t& reg_a, const int8x4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_i32_32x32x8i8(bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + 0, + 0, + 0); + } +}; + +template +struct intrin_mfma_i32_16x16x16i8; + +template <> +struct intrin_mfma_i32_16x16x16i8<16, 16> +{ + template + __device__ static void Run(const int8x4_t& reg_a, const int8x4_t& reg_b, FloatC& reg_c) + { + reg_c.template AsType()(Number<0>{}) = + __builtin_amdgcn_mfma_i32_16x16x16i8(bit_cast(reg_a), + bit_cast(reg_b), + reg_c.template AsType()[Number<0>{}], + 0, + 0, + 0); + } +}; + +template +struct intrin_mfma_f64_16x16x4f64; + +template <> +struct intrin_mfma_f64_16x16x4f64<16, 16> +{ + template + __device__ static void Run(const double& reg_a, const double& reg_b, FloatC& reg_c) + { +#ifdef __gfx90a__ + reg_c.template AsType()(Number<0>{}) = __builtin_amdgcn_mfma_f64_16x16x4f64( + reg_a, reg_b, reg_c.template AsType()[Number<0>{}], 0, 0, 0); +#else + ignore = reg_a; + ignore = reg_b; + ignore = reg_c; +#endif + } +}; +} // namespace ck +#endif diff --git a/include/ck/utility/array.hpp b/include/ck/utility/array.hpp new file mode 100644 index 00000000..370a457f --- /dev/null +++ b/include/ck/utility/array.hpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_ARRAY_HPP +#define CK_ARRAY_HPP + +#include "functional2.hpp" +#include "sequence.hpp" + +namespace ck { + +template +struct Array +{ + using type = Array; + using data_type = TData; + + TData mData[NSize]; + + __host__ __device__ static constexpr index_t Size() { return NSize; } + + __host__ __device__ constexpr const TData& At(index_t i) const { return mData[i]; } + + __host__ __device__ constexpr TData& At(index_t i) { return mData[i]; } + + __host__ __device__ constexpr const TData& operator[](index_t i) const { return At(i); } + + __host__ __device__ constexpr TData& operator()(index_t i) { return At(i); } + + template + __host__ __device__ constexpr auto operator=(const T& a) + { + static_assert(T::Size() == Size(), "wrong! size not the same"); + + static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); + + return *this; + } +}; + +// empty Array +template +struct Array +{ + using type = Array; + using data_type = TData; + + __host__ __device__ static constexpr index_t Size() { return 0; } +}; + +template +__host__ __device__ constexpr auto make_array(X&& x, Xs&&... xs) +{ + using data_type = remove_cvref_t; + return Array{std::forward(x), std::forward(xs)...}; +} + +// make empty array +template +__host__ __device__ constexpr auto make_array() +{ + return Array{}; +} + +} // namespace ck +#endif diff --git a/include/ck/utility/array_multi_index.hpp b/include/ck/utility/array_multi_index.hpp new file mode 100644 index 00000000..9b8d5b95 --- /dev/null +++ b/include/ck/utility/array_multi_index.hpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_ARRAY_MULTI_INDEX_HPP +#define CK_ARRAY_MULTI_INDEX_HPP + +#include "common_header.hpp" + +namespace ck { + +template +using MultiIndex = Array; + +template +__host__ __device__ constexpr auto make_multi_index(Xs&&... 
xs) +{ + return make_array(index_t{xs}...); +} + +template +__host__ __device__ constexpr auto make_zero_multi_index() +{ + return unpack([](auto... xs) { return make_multi_index(xs...); }, + typename uniform_sequence_gen::type{}); +} + +template +__host__ __device__ constexpr auto to_multi_index(const T& x) +{ + return unpack([](auto... ys) { return make_multi_index(ys...); }, x); +} + +template +__host__ __device__ constexpr auto operator+=(MultiIndex& y, const X& x) +{ + static_assert(X::Size() == NSize, "wrong! size not the same"); + static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; }); + return y; +} + +template +__host__ __device__ constexpr auto operator-=(MultiIndex& y, const X& x) +{ + static_assert(X::Size() == NSize, "wrong! size not the same"); + static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; }); + return y; +} + +template +__host__ __device__ constexpr auto operator+(const MultiIndex& a, const T& b) +{ + using type = MultiIndex; + static_assert(T::Size() == NSize, "wrong! size not the same"); + type r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] + b[i]; }); + return r; +} + +template +__host__ __device__ constexpr auto operator-(const MultiIndex& a, const T& b) +{ + using type = MultiIndex; + static_assert(T::Size() == NSize, "wrong! size not the same"); + type r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] - b[i]; }); + return r; +} + +template +__host__ __device__ constexpr auto operator*(const MultiIndex& a, const T& b) +{ + using type = MultiIndex; + static_assert(T::Size() == NSize, "wrong! size not the same"); + type r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a[i] * b[i]; }); + return r; +} + +} // namespace ck +#endif diff --git a/include/ck/utility/c_style_pointer_cast.hpp b/include/ck/utility/c_style_pointer_cast.hpp new file mode 100644 index 00000000..6e8b0081 --- /dev/null +++ b/include/ck/utility/c_style_pointer_cast.hpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_C_STYLE_POINTER_CAST_HPP +#define CK_C_STYLE_POINTER_CAST_HPP + +#include "type.hpp" +#include "enable_if.hpp" + +namespace ck { + +template && is_pointer_v, bool>::type = false> +__host__ __device__ PY c_style_pointer_cast(PX p_x) +{ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" +#pragma clang diagnostic ignored "-Wcast-align" + return (PY)p_x; // NOLINT(old-style-cast, cast-align) +#pragma clang diagnostic pop +} + +} // namespace ck +#endif diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp new file mode 100644 index 00000000..1378bbe4 --- /dev/null +++ b/include/ck/utility/common_header.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
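+//
+// common_header.hpp is an umbrella include that aggregates the ck utility headers
+// below (arrays, tuples, sequences, data types, buffer addressing, ...) so that a
+// kernel can pull them in with a single include. Illustrative sketch of the
+// multi-index helpers defined in array_multi_index.hpp above (values are hypothetical):
+//
+//   auto idx = make_multi_index(1, 2, 3);       // a 3-element index (Array of index_t)
+//   idx += make_multi_index(4, 5, 6);           // element-wise update via operator+=
+//   auto sum = idx + make_multi_index(1, 1, 1); // element-wise add via operator+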
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/array.hpp" +#include "ck/utility/container_helper.hpp" +#include "ck/utility/statically_indexed_array.hpp" +#include "ck/utility/container_element_picker.hpp" +#include "ck/utility/multi_index.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/functional.hpp" +#include "ck/utility/functional2.hpp" +#include "ck/utility/functional3.hpp" +#include "ck/utility/functional4.hpp" +#include "ck/utility/enable_if.hpp" +#include "ck/utility/ignore.hpp" +#include "ck/utility/integral_constant.hpp" +#include "ck/utility/math.hpp" +#include "ck/utility/number.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/sequence_helper.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/tuple_helper.hpp" +#include "ck/utility/type.hpp" +#include "ck/utility/magic_division.hpp" +#include "ck/utility/c_style_pointer_cast.hpp" +#include "ck/utility/is_known_at_compile_time.hpp" +#include "ck/utility/transpose_vectors.hpp" +#include "ck/utility/inner_product.hpp" +#include "ck/utility/thread_group.hpp" +#include "ck/utility/debug.hpp" + +#include "ck/utility/amd_buffer_addressing.hpp" +#include "ck/utility/generic_memory_space_atomic.hpp" +#include "ck/utility/get_id.hpp" +#include "ck/utility/thread_group.hpp" +#include "ck/utility/synchronization.hpp" +#include "ck/utility/amd_address_space.hpp" +#include "ck/utility/static_buffer.hpp" +#include "ck/utility/dynamic_buffer.hpp" + +// TODO: remove this +#if CK_USE_AMD_INLINE_ASM +#include "ck/utility/amd_inline_asm.hpp" +#endif + +#ifdef CK_USE_AMD_MFMA +#include "ck/utility/amd_xdlops.hpp" +#endif diff --git a/include/ck/utility/container_element_picker.hpp b/include/ck/utility/container_element_picker.hpp new file mode 100644 index 00000000..abc5185e --- /dev/null +++ b/include/ck/utility/container_element_picker.hpp @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_CONTAINER_ELEMENT_PICKER_HPP +#define CK_CONTAINER_ELEMENT_PICKER_HPP + +#include "functional2.hpp" +#include "sequence.hpp" + +namespace ck { + +// Arr: Array or StaticallyIndexedArray +// Picks: Sequence<...> +template +struct ContainerElementPicker +{ + using type = ContainerElementPicker; +#if 0 + using data_type = typename Arr::data_type; +#endif + + __host__ __device__ constexpr ContainerElementPicker() = delete; + + __host__ __device__ constexpr ContainerElementPicker(Arr& array) : mArray{array} + { + constexpr index_t imax = + reduce_on_sequence(Picks{}, math::maximize{}, Number<0>{}); + + static_assert(imax < Arr::Size(), "wrong! exceeding # array element"); + } + + __host__ __device__ static constexpr auto Size() { return Picks::Size(); } + + template + __host__ __device__ constexpr const auto& At(Number i) const + { + static_assert(I < Size(), "wrong!"); + + constexpr auto IP = Picks{}[i]; + return mArray[IP]; + } + + template + __host__ __device__ constexpr auto& At(Number i) + { + static_assert(I < Size(), "wrong!"); + + constexpr auto IP = Picks{}[i]; + return mArray(IP); + } + + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return At(i); + } + + template + __host__ __device__ constexpr auto operator=(const T& a) + { + static_assert(T::Size() == Size(), "wrong! 
size not the same"); + + static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); + + return *this; + } + + private: + Arr& mArray; +}; + +// Arr: Array or StaticallyIndexedArray +// Picks: Sequence<...> +template +struct ConstantContainerElementPicker +{ + using type = ConstantContainerElementPicker; +#if 0 + using data_type = typename Arr::data_type; +#endif + + __host__ __device__ constexpr ConstantContainerElementPicker() = delete; + + __host__ __device__ constexpr ConstantContainerElementPicker(const Arr& array) : mArray{array} + { + constexpr index_t imax = + reduce_on_sequence(Picks{}, math::maximize{}, Number<0>{}); + + static_assert(imax < Arr::Size(), "wrong! exceeding # array element"); + } + + __host__ __device__ static constexpr auto Size() { return Picks::Size(); } + + template + __host__ __device__ constexpr const auto& At(Number i) const + { + static_assert(I < Size(), "wrong!"); + + constexpr auto IP = Picks{}[i]; + return mArray[IP]; + } + + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + private: + const Arr& mArray; +}; + +template +__host__ __device__ constexpr auto operator+=(ContainerElementPicker& y, const X& x) +{ + using Y = ContainerElementPicker; + constexpr index_t nsize = Y::Size(); + + static_assert(nsize == X::Size(), "wrong! size not the same"); + + static_for<0, nsize, 1>{}([&](auto i) { y(i) += x[i]; }); + + return y; +} + +template +__host__ __device__ constexpr auto operator-=(ContainerElementPicker& y, const X& x) +{ + using Y = ContainerElementPicker; + constexpr index_t nsize = Y::Size(); + + static_assert(nsize == X::Size(), "wrong! size not the same"); + + static_for<0, nsize, 1>{}([&](auto i) { y(i) -= x[i]; }); + + return y; +} + +template +__host__ __device__ constexpr auto pick_container_element(Arr& a, Picks) +{ + return ContainerElementPicker(a); +} + +template +__host__ __device__ constexpr auto pick_container_element(const Arr& a, Picks) +{ + return ConstantContainerElementPicker(a); +} + +} // namespace ck +#endif diff --git a/include/ck/utility/container_helper.hpp b/include/ck/utility/container_helper.hpp new file mode 100644 index 00000000..c8b02bc5 --- /dev/null +++ b/include/ck/utility/container_helper.hpp @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_CONTAINER_HELPER_HPP +#define CK_CONTAINER_HELPER_HPP + +#include "sequence.hpp" +#include "sequence_helper.hpp" +#include "array.hpp" +#include "tuple.hpp" +#include "tuple_helper.hpp" +#include "statically_indexed_array.hpp" +#include "container_element_picker.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto container_push_back(const Array& a, const TData& x) +{ + Array r; + + static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; }); + + r(Number{}) = x; + + return r; +} + +template +__host__ __device__ constexpr auto container_push_front(const Tuple& a, const T& x) +{ + return container_concat(make_tuple(x), a); +} + +template +__host__ __device__ constexpr auto container_push_back(const Tuple& a, const T& x) +{ + return container_concat(a, make_tuple(x)); +} + +template +__host__ __device__ constexpr auto +container_reorder_given_new2old(const Array& old_array, Sequence /*new2old*/) +{ + static_assert(NSize == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! 
invalid reorder map"); + + return make_array(old_array[Number{}]...); +} + +template +__host__ __device__ constexpr auto +container_reorder_given_old2new(const Array& old_array, Sequence old2new) +{ + return container_reorder_given_new2old( + old_array, typename sequence_map_inverse::type{}); +} + +template +__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple& old_tuple, + Sequence /*new2old*/) +{ + static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + + return make_tuple(old_tuple[Number{}]...); +} + +template +__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple& old_tuple, + Sequence old2new) +{ + return container_reorder_given_new2old( + old_tuple, typename sequence_map_inverse::type{}); +} + +template +__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence /* old_seq */, + Sequence /*new2old*/) +{ + static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + + return Sequence::At(Number{})...>{}; +} + +template +__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence old_seq, + Sequence /* old2new */) +{ + static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent"); + + static_assert(is_valid_sequence_map>{}, "wrong! invalid reorder map"); + + constexpr auto new2old = typename sequence_map_inverse>::type{}; + + return container_reorder_given_new2old(old_seq, new2old); +} + +#if !CK_WORKAROUND_SWDEV_275126 +// rocm-4.1 compiler would crash for recursive lambda +template +__host__ __device__ constexpr auto container_reduce(const Container& x, + Reduce reduce, + Init init, + Number = Number<0>{}, + Number = Number{}, + Number = Number<1>{}) +{ + static_assert((IEnd - IBegin) % IStep == 0, "wrong!"); + + // f is recursive function, fs is a dummy of f + // i is index, y_old is current scan, r_old is current reduction + auto f = [&](auto fs, auto i, auto r_old) { + auto r_new = reduce(x[i], r_old); + + if constexpr(i.value < IEnd - IStep) + { + // recursively call f/fs + return fs(fs, i + Number{}, r_new); + } + else + { + return r_new; + } + }; + + // start recursion + return f(f, Number{}, init); +} +#else +// i is index, y_old is current scan, r_old is current reduction +template +__host__ __device__ constexpr auto container_reduce_impl( + const Container& x, Reduce reduce, ROld r_old, Number i, Number, Number) +{ + auto r_new = reduce(x[i], r_old); + + if constexpr(i.value < IEnd - IStep) + { + return container_reduce_impl( + x, reduce, r_new, i + Number{}, Number{}, Number{}); + } + else + { + return r_new; + } +} + +// rocm-4.1 compiler would crash for recursive lambda +// container reduce with initial value +template +__host__ __device__ constexpr auto container_reduce(const Container& x, + Reduce reduce, + Init init, + Number = Number<0>{}, + Number = Number{}, + Number = Number<1>{}) +{ + static_assert((IEnd - IBegin) % IStep == 0, "wrong!"); + + if constexpr(IEnd > IBegin) + { + return container_reduce_impl( + x, reduce, init, Number{}, Number{}, Number{}); + } + else + { + return init; + } +} +#endif + +template +__host__ __device__ constexpr auto +container_reverse_inclusive_scan(const Array& x, Reduce f, TData init) +{ + Array y; + + TData r = init; + + static_for{}([&](auto i) { + r = f(r, x[i]); + y(i) = r; + }); + + r = f(r, x[Number<0>{}]); + y(Number<0>{}) = r; + + return 
y; +} + +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Array& x, Reduce f, TData init) +{ + Array y; + + TData r = init; + + static_for{}([&](auto i) { + y(i) = r; + r = f(r, x[i]); + }); + + y(Number<0>{}) = r; + + return y; +} + +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Sequence& seq, Reduce f, Number) +{ + return reverse_exclusive_scan_sequence(seq, f, Number{}); +} + +#if !CK_WORKAROUND_SWDEV_275126 +// rocm4.1 compiler would crash with recursive lambda +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Tuple& x, Reduce reduce, Init init) +{ + constexpr index_t NSize = sizeof...(Xs); + + // f is recursive function, fs is a dummy of f + // i is index, y_old is current scan, r_old is current reduction + auto f = [&](auto fs, auto i, auto y_old, auto r_old) { + auto r_new = reduce(x[i], r_old); + + auto y_new = container_push_front(y_old, r_new); + + if constexpr(i.value > 1) + { + // recursively call f/fs + return fs(fs, i - Number<1>{}, y_new, r_new); + } + else + { + return y_new; + } + }; + + // start recursion + return f(f, Number{}, make_tuple(init), init); +} +#else +// i is index, y_old is current scan, r_old is current reduction +template +__host__ __device__ constexpr auto container_reverse_exclusive_scan_impl( + const Tuple& x, Reduce reduce, Number i, YOld y_old, ROld r_old) +{ + auto r_new = reduce(x[i], r_old); + + auto y_new = container_push_front(y_old, r_new); + + if constexpr(i.value > 1) + { + // recursively call f/fs + return container_reverse_exclusive_scan_impl(x, reduce, i - Number<1>{}, y_new, r_new); + } + else + { + return y_new; + } +} + +template +__host__ __device__ constexpr auto +container_reverse_exclusive_scan(const Tuple& x, Reduce reduce, Init init) +{ + constexpr index_t NSize = sizeof...(Xs); + + return container_reverse_exclusive_scan_impl( + x, reduce, Number{}, make_tuple(init), init); +} +#endif + +// TODO: update to like container_reverse_exclusive_scan to deal with Tuple of Numebr<> +template +__host__ __device__ constexpr auto +container_reverse_inclusive_scan(const Tuple& x, Reduce f, TData init) +{ + constexpr index_t NSize = sizeof...(Xs); + + Tuple y; + + TData r = init; + + static_for{}([&](auto i) { + r = f(r, x[i]); + y(i) = r; + }); + + r = f(r, x[Number<0>{}]); + y(Number<0>{}) = r; + + return y; +} + +template +__host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys) +{ + return container_concat(x, container_concat(ys...)); +} + +template +__host__ __device__ constexpr auto container_concat(const Array& ax, const Array& ay) +{ + return unpack2( + [&](auto&&... zs) { return make_array(std::forward(zs)...); }, ax, ay); +} + +template +__host__ __device__ constexpr auto container_concat(const Tuple& tx, const Tuple& ty) +{ + return unpack2( + [&](auto&&... zs) { return make_tuple(std::forward(zs)...); }, tx, ty); +} + +template +__host__ __device__ constexpr auto container_concat(const Container& x) +{ + return x; +} + +template +__host__ __device__ constexpr auto get_container_subset(const Array& arr, Sequence) +{ + static_assert(N >= sizeof...(Is), "wrong! size"); + + return make_array(arr[Number{}]...); +} + +template +__host__ __device__ constexpr auto get_container_subset(const Tuple& tup, Sequence) +{ + static_assert(sizeof...(Ts) >= sizeof...(Is), "wrong! 
size"); + + return make_tuple(tup[Number{}]...); +} + +template +__host__ __device__ constexpr void +set_container_subset(Array& y, Sequence picks, const Array& x) +{ + static_assert(N >= sizeof...(Is), "wrong! size"); + + static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; }); +} + +template +__host__ __device__ constexpr void +set_container_subset(Tuple& y, Sequence picks, const Tuple& x) +{ + static_assert(sizeof...(Ys) >= sizeof...(Is) && sizeof...(Is) == sizeof...(Xs), "wrong! size"); + + static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; }); +} + +template +__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence) +{ + using Seq = Sequence; + + return generate_tuple( + [&](auto i) { + constexpr index_t tmp = Seq::At(i); + return Number{}; + }, + Seq::Size()); +} + +} // namespace ck +#endif diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp new file mode 100644 index 00000000..40ee8b61 --- /dev/null +++ b/include/ck/utility/data_type.hpp @@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/statically_indexed_array.hpp" + +namespace ck { + +using bhalf_t = ushort; +using half_t = _Float16; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using int4_t = _BitInt(4); +#endif + +// vector_type +template +struct vector_type; + +// Caution: DO NOT REMOVE +// intentionally have only declaration but no definition to cause compilation failure when trying to +// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of +// vectors" +template +struct vector_type; + +// Caution: DO NOT REMOVE +// intentionally have only declaration but no definition to cause compilation failure when trying to +// instantiate this template. 
The purpose is to catch user's mistake when trying to make "vector of +// vectors" +template +struct vector_type, N>; + +// vector_type_maker +// This is the right way to handle "vector of vectors": making a bigger vector instead +template +struct vector_type_maker +{ + using type = vector_type; +}; + +template +struct vector_type_maker +{ + using type = vector_type; +}; + +template +struct vector_type_maker, N0> +{ + using type = vector_type; +}; + +template +using vector_type_maker_t = typename vector_type_maker::type; + +template +__host__ __device__ constexpr auto make_vector_type(Number) +{ + return typename vector_type_maker::type{}; +} + +// scalar_type +template +struct scalar_type; + +// is_scalar_type +template +struct is_scalar_type +{ + static constexpr bool value = (scalar_type>::vector_size == 1); +}; + +// has_same_scalar_type +template +using has_same_scalar_type = is_same>::type, + typename scalar_type>::type>; + +template +struct scalar_type +{ + using type = T; + static constexpr index_t vector_size = N; +}; + +template +struct scalar_type> +{ + using type = T; + static constexpr index_t vector_size = N; +}; + +// +template <> +struct scalar_type +{ + using type = double; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = float; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = half_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = bhalf_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = int32_t; + static constexpr index_t vector_size = 1; +}; + +template <> +struct scalar_type +{ + using type = int8_t; + static constexpr index_t vector_size = 1; +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +template <> +struct scalar_type +{ + using type = int4_t; + static constexpr index_t vector_size = 1; +}; +#endif + +// +template +struct vector_type +{ + using d1_t = T; + using type = d1_t; + + union + { + T d1_; + StaticallyIndexedArray d1x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value, "wrong!"); + + return data_.d1x1_; + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value, "wrong!"); + + return data_.d1x1_; + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + + using type = d2_t; + + union + { + d2_t d2_; + StaticallyIndexedArray d1x2_; + StaticallyIndexedArray d2x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value, "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x2_; + } + else if constexpr(is_same::value) + { + return data_.d2x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value, "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x2_; + } + else if constexpr(is_same::value) + { + return data_.d2x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T 
d4_t __attribute__((ext_vector_type(4))); + + using type = d4_t; + + union + { + d4_t d4_; + StaticallyIndexedArray d1x4_; + StaticallyIndexedArray d2x2_; + StaticallyIndexedArray d4x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x4_; + } + else if constexpr(is_same::value) + { + return data_.d2x2_; + } + else if constexpr(is_same::value) + { + return data_.d4x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x4_; + } + else if constexpr(is_same::value) + { + return data_.d2x2_; + } + else if constexpr(is_same::value) + { + return data_.d4x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + + using type = d8_t; + + union + { + d8_t d8_; + StaticallyIndexedArray d1x8_; + StaticallyIndexedArray d2x4_; + StaticallyIndexedArray d4x2_; + StaticallyIndexedArray d8x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x8_; + } + else if constexpr(is_same::value) + { + return data_.d2x4_; + } + else if constexpr(is_same::value) + { + return data_.d4x2_; + } + else if constexpr(is_same::value) + { + return data_.d8x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x8_; + } + else if constexpr(is_same::value) + { + return data_.d2x4_; + } + else if constexpr(is_same::value) + { + return data_.d4x2_; + } + else if constexpr(is_same::value) + { + return data_.d8x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + + using type = d16_t; + + union + { + d16_t d16_; + StaticallyIndexedArray d1x16_; + StaticallyIndexedArray d2x8_; + StaticallyIndexedArray d4x4_; + StaticallyIndexedArray d8x2_; + StaticallyIndexedArray d16x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x16_; + } + else if constexpr(is_same::value) + { + return data_.d2x8_; + } + else if constexpr(is_same::value) + { + return data_.d4x4_; + } + else if constexpr(is_same::value) + { + return data_.d8x2_; + } + else if 
constexpr(is_same::value) + { + return data_.d16x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x16_; + } + else if constexpr(is_same::value) + { + return data_.d2x8_; + } + else if constexpr(is_same::value) + { + return data_.d4x4_; + } + else if constexpr(is_same::value) + { + return data_.d8x2_; + } + else if constexpr(is_same::value) + { + return data_.d16x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + + using type = d32_t; + + union + { + d32_t d32_; + StaticallyIndexedArray d1x32_; + StaticallyIndexedArray d2x16_; + StaticallyIndexedArray d4x8_; + StaticallyIndexedArray d8x4_; + StaticallyIndexedArray d16x2_; + StaticallyIndexedArray d32x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x32_; + } + else if constexpr(is_same::value) + { + return data_.d2x16_; + } + else if constexpr(is_same::value) + { + return data_.d4x8_; + } + else if constexpr(is_same::value) + { + return data_.d8x4_; + } + else if constexpr(is_same::value) + { + return data_.d16x2_; + } + else if constexpr(is_same::value) + { + return data_.d32x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x32_; + } + else if constexpr(is_same::value) + { + return data_.d2x16_; + } + else if constexpr(is_same::value) + { + return data_.d4x8_; + } + else if constexpr(is_same::value) + { + return data_.d8x4_; + } + else if constexpr(is_same::value) + { + return data_.d16x2_; + } + else if constexpr(is_same::value) + { + return data_.d32x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + typedef T d64_t __attribute__((ext_vector_type(64))); + + using type = d64_t; + + union + { + d64_t d64_; + StaticallyIndexedArray d1x64_; + StaticallyIndexedArray d2x32_; + StaticallyIndexedArray d4x16_; + StaticallyIndexedArray d8x8_; + StaticallyIndexedArray d16x4_; + StaticallyIndexedArray d32x2_; + StaticallyIndexedArray d64x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + 
is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x64_; + } + else if constexpr(is_same::value) + { + return data_.d2x32_; + } + else if constexpr(is_same::value) + { + return data_.d4x16_; + } + else if constexpr(is_same::value) + { + return data_.d8x8_; + } + else if constexpr(is_same::value) + { + return data_.d16x4_; + } + else if constexpr(is_same::value) + { + return data_.d32x2_; + } + else if constexpr(is_same::value) + { + return data_.d64x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x64_; + } + else if constexpr(is_same::value) + { + return data_.d2x32_; + } + else if constexpr(is_same::value) + { + return data_.d4x16_; + } + else if constexpr(is_same::value) + { + return data_.d8x8_; + } + else if constexpr(is_same::value) + { + return data_.d16x4_; + } + else if constexpr(is_same::value) + { + return data_.d32x2_; + } + else if constexpr(is_same::value) + { + return data_.d64x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + typedef T d64_t __attribute__((ext_vector_type(64))); + typedef T d128_t __attribute__((ext_vector_type(128))); + + using type = d128_t; + + union + { + d128_t d128_; + StaticallyIndexedArray d1x128_; + StaticallyIndexedArray d2x64_; + StaticallyIndexedArray d4x32_; + StaticallyIndexedArray d8x16_; + StaticallyIndexedArray d16x8_; + StaticallyIndexedArray d32x4_; + StaticallyIndexedArray d64x2_; + StaticallyIndexedArray d128x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x128_; + } + else if constexpr(is_same::value) + { + return data_.d2x64_; + } + else if constexpr(is_same::value) + { + return data_.d4x32_; + } + else if constexpr(is_same::value) + { + return data_.d8x16_; + } + else if constexpr(is_same::value) + { + return data_.d16x8_; + } + else if constexpr(is_same::value) + { + return data_.d32x4_; + } + else if constexpr(is_same::value) + { + return data_.d64x2_; + } + else if constexpr(is_same::value) + { + return data_.d128x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x128_; + } + else if constexpr(is_same::value) + { + return data_.d2x64_; + } + else if constexpr(is_same::value) + { + return data_.d4x32_; + } + else if constexpr(is_same::value) + { + return data_.d8x16_; + } + else if constexpr(is_same::value) + { + return data_.d16x8_; + } + else if constexpr(is_same::value) + { + return data_.d32x4_; + } + else if 
constexpr(is_same::value) + { + return data_.d64x2_; + } + else if constexpr(is_same::value) + { + return data_.d128x1_; + } + } +}; + +template +struct vector_type +{ + using d1_t = T; + typedef T d2_t __attribute__((ext_vector_type(2))); + typedef T d4_t __attribute__((ext_vector_type(4))); + typedef T d8_t __attribute__((ext_vector_type(8))); + typedef T d16_t __attribute__((ext_vector_type(16))); + typedef T d32_t __attribute__((ext_vector_type(32))); + typedef T d64_t __attribute__((ext_vector_type(64))); + typedef T d128_t __attribute__((ext_vector_type(128))); + typedef T d256_t __attribute__((ext_vector_type(256))); + + using type = d256_t; + + union + { + d256_t d256_; + StaticallyIndexedArray d1x256_; + StaticallyIndexedArray d2x128_; + StaticallyIndexedArray d4x64_; + StaticallyIndexedArray d8x32_; + StaticallyIndexedArray d16x16_; + StaticallyIndexedArray d32x8_; + StaticallyIndexedArray d64x4_; + StaticallyIndexedArray d128x2_; + StaticallyIndexedArray d256x1_; + } data_; + + __host__ __device__ constexpr vector_type() : data_{type{0}} {} + + __host__ __device__ constexpr vector_type(type v) : data_{v} {} + + template + __host__ __device__ constexpr const auto& AsType() const + { + static_assert( + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x256_; + } + else if constexpr(is_same::value) + { + return data_.d2x128_; + } + else if constexpr(is_same::value) + { + return data_.d4x64_; + } + else if constexpr(is_same::value) + { + return data_.d8x32_; + } + else if constexpr(is_same::value) + { + return data_.d16x16_; + } + else if constexpr(is_same::value) + { + return data_.d32x8_; + } + else if constexpr(is_same::value) + { + return data_.d64x4_; + } + else if constexpr(is_same::value) + { + return data_.d128x2_; + } + else if constexpr(is_same::value) + { + return data_.d256x1_; + } + } + + template + __host__ __device__ constexpr auto& AsType() + { + static_assert( + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value || + is_same::value || is_same::value || is_same::value, + "wrong!"); + + if constexpr(is_same::value) + { + return data_.d1x256_; + } + else if constexpr(is_same::value) + { + return data_.d2x128_; + } + else if constexpr(is_same::value) + { + return data_.d4x64_; + } + else if constexpr(is_same::value) + { + return data_.d8x32_; + } + else if constexpr(is_same::value) + { + return data_.d16x16_; + } + else if constexpr(is_same::value) + { + return data_.d32x8_; + } + else if constexpr(is_same::value) + { + return data_.d64x4_; + } + else if constexpr(is_same::value) + { + return data_.d128x2_; + } + else if constexpr(is_same::value) + { + return data_.d256x1_; + } + } +}; + +// fp64 +using double2_t = typename vector_type::type; +using double4_t = typename vector_type::type; + +// fp32 +using float2_t = typename vector_type::type; +using float4_t = typename vector_type::type; +using float8_t = typename vector_type::type; +using float16_t = typename vector_type::type; +using float32_t = typename vector_type::type; +using float64_t = typename vector_type::type; + +// fp16 +using half2_t = typename vector_type::type; +using half4_t = typename vector_type::type; +using half8_t = typename vector_type::type; +using half16_t = typename vector_type::type; +using half32_t = typename vector_type::type; +using half64_t = typename 
vector_type::type; + +// bfp16 +using bhalf2_t = typename vector_type::type; +using bhalf4_t = typename vector_type::type; +using bhalf8_t = typename vector_type::type; +using bhalf16_t = typename vector_type::type; +using bhalf32_t = typename vector_type::type; +using bhalf64_t = typename vector_type::type; + +// i32 +using int32x2_t = typename vector_type::type; +using int32x4_t = typename vector_type::type; +using int32x8_t = typename vector_type::type; +using int32x16_t = typename vector_type::type; +using int32x32_t = typename vector_type::type; +using int32x64_t = typename vector_type::type; + +// i8 +using int8x2_t = typename vector_type::type; +using int8x4_t = typename vector_type::type; +using int8x8_t = typename vector_type::type; +using int8x16_t = typename vector_type::type; +using int8x32_t = typename vector_type::type; +using int8x64_t = typename vector_type::type; + +// Convert X to Y +template +__host__ __device__ constexpr Y type_convert(X x) +{ + static_assert(!std::is_reference_v && !std::is_reference_v); + + return static_cast(x); +} + +// convert bfp16 to fp32 +template <> +inline __host__ __device__ constexpr float type_convert(bhalf_t x) +{ + union + { + uint32_t int32; + float fp32; + } u = {uint32_t(x) << 16}; + + return u.fp32; +} + +// convert fp32 to bfp16 +template <> +inline __host__ __device__ constexpr bhalf_t type_convert(float x) +{ + union + { + float fp32; + uint32_t int32; + } u = {x}; + + if(~u.int32 & 0x7f800000) + { + // When the exponent bits are not all 1s, then the value is zero, normal, + // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus + // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd). + // This causes the bfloat16's mantissa to be incremented by 1 if the 16 + // least significant bits of the float mantissa are greater than 0x8000, + // or if they are equal to 0x8000 and the least significant bit of the + // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when + // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already + // has the value 0x7f, then incrementing it causes it to become 0x00 and + // the exponent is incremented by one, which is the next higher FP value + // to the unrounded bfloat16 value. When the bfloat16 value is subnormal + // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up + // to a normal value with an exponent of 0x01 and a mantissa of 0x00. + // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F, + // incrementing it causes it to become an exponent of 0xFF and a mantissa + // of 0x00, which is Inf, the next higher value to the unrounded value. + u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even + } + else if(u.int32 & 0xffff) + { + // When all of the exponent bits are 1, the value is Inf or NaN. + // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero + // mantissa bit. Quiet NaN is indicated by the most significant mantissa + // bit being 1. Signaling NaN is indicated by the most significant + // mantissa bit being 0 but some other bit(s) being 1. If any of the + // lower 16 bits of the mantissa are 1, we set the least significant bit + // of the bfloat16 mantissa, in order to preserve signaling NaN in case + // the bloat16's mantissa bits are all 0. 
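        // Worked example of the branch below: the signaling NaN 0x7F800001 has all
        // exponent bits set and only a low-order mantissa bit set in the lower 16 bits.
        // Plain truncation to the top 16 bits would give 0x7F80, which is +Inf; setting
        // bit 16 first gives 0x7F81 after the shift, which is still a NaN, so the
        // "this was a NaN" information is preserved.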
+ u.int32 |= 0x10000; // Preserve signaling NaN + } + + return uint16_t(u.int32 >> 16); +} + +template +struct NumericLimits +{ + __host__ __device__ static constexpr T Min() { return std::numeric_limits::min(); } + + __host__ __device__ static constexpr T Max() { return std::numeric_limits::max(); } + + __host__ __device__ static constexpr T Lowest() { return std::numeric_limits::lowest(); } + + __host__ __device__ static constexpr T QuietNaN() + { + return std::numeric_limits::quiet_NaN(); + } + + __host__ __device__ static constexpr T Infinity() { return std::numeric_limits::infinity(); } +}; + +template <> +struct NumericLimits +{ + static constexpr unsigned short binary_min = 0x0400; + static constexpr unsigned short binary_max = 0x7BFF; + static constexpr unsigned short binary_lowest = 0xFBFF; + static constexpr unsigned short binary_qnan = 0x7FFF; + + __host__ __device__ static constexpr half_t Min() { return bit_cast(binary_min); } + + __host__ __device__ static constexpr half_t Max() { return bit_cast(binary_max); } + + __host__ __device__ static constexpr half_t Lowest() { return bit_cast(binary_lowest); } + + __host__ __device__ static constexpr half_t QuietNaN() { return bit_cast(binary_qnan); } +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +template <> +struct NumericLimits +{ + __host__ __device__ static constexpr int4_t Min() { return int4_t(-8); } + + __host__ __device__ static constexpr int4_t Max() { return int4_t(7); } + + __host__ __device__ static constexpr int4_t Lowest() { return int4_t(-8); } +}; +#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + +} // namespace ck diff --git a/include/ck/utility/debug.hpp b/include/ck/utility/debug.hpp new file mode 100644 index 00000000..593bbb71 --- /dev/null +++ b/include/ck/utility/debug.hpp @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef UTILITY_DEBUG_HPP +#define UTILITY_DEBUG_HPP + +namespace ck { +namespace debug { + +namespace detail { +template +struct PrintAsType; + +template +struct PrintAsType::value>::type> +{ + using type = float; + __host__ __device__ static void Print(const T& p) { printf("%.3f ", static_cast(p)); } +}; + +template <> +struct PrintAsType +{ + using type = float; + __host__ __device__ static void Print(const ck::half_t& p) + { + printf("%.3f ", static_cast(p)); + } +}; + +template +struct PrintAsType::value>::type> +{ + using type = int; + __host__ __device__ static void Print(const T& p) { printf("%d ", static_cast(p)); } +}; +} // namespace detail + +// Print at runtime the data in shared memory in 128 bytes per row format given shared mem pointer +// and the number of elements. Can optionally specify strides between elements and how many bytes' +// worth of data per row. 
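// With row_elements = row_bytes / sizeof(T), element_stride must lie in
// [1, row_elements] (enforced by the static_assert below); e.g. for half_t with
// the default 128-byte rows that allows strides from 1 to 64.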
+// +// Usage example: +// +// debug::print_shared(a_block_buf.p_data_, index_t(a_block_desc_k0_m_k1.GetElementSpaceSize())); +// +template +__device__ void print_shared(T const* p_shared, index_t num_elements) +{ + constexpr index_t row_elements = row_bytes / sizeof(T); + static_assert((element_stride >= 1 && element_stride <= row_elements), + "element_stride should between [1, row_elements]"); + + index_t wgid = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z; + index_t tid = + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x; + + __syncthreads(); + + if(tid == 0) + { + printf("\nWorkgroup id %d, bytes per row %d, element stride %d\n\n", + wgid, + row_bytes, + element_stride); + for(index_t i = 0; i < num_elements; i += row_elements) + { + printf("elem %5d: ", i); + for(index_t j = 0; j < row_elements; j += element_stride) + { + detail::PrintAsType::Print(p_shared[i + j]); + } + + printf("\n"); + } + printf("\n"); + } + + __syncthreads(); +} + +} // namespace debug +} // namespace ck + +#endif // UTILITY_DEBUG_HPP diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp new file mode 100644 index 00000000..c6f0d299 --- /dev/null +++ b/include/ck/utility/dynamic_buffer.hpp @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/data_type.hpp" +#include "enable_if.hpp" +#include "c_style_pointer_cast.hpp" +#include "amd_buffer_addressing.hpp" +#include "generic_memory_space_atomic.hpp" + +namespace ck { + +// T may be scalar or vector +// X may be scalar or vector +// T and X have same scalar type +// X contains multiple T +template +struct DynamicBuffer +{ + using type = T; + + T* p_data_; + ElementSpaceSize element_space_size_; + T invalid_element_value_ = T{0}; + + __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size) + : p_data_{p_data}, element_space_size_{element_space_size} + { + } + + __host__ __device__ constexpr DynamicBuffer(T* p_data, + ElementSpaceSize element_space_size, + T invalid_element_value) + : p_data_{p_data}, + element_space_size_{element_space_size}, + invalid_element_value_{invalid_element_value} + { + } + + __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() + { + return BufferAddressSpace; + } + + __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } + + __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; } + + template >::type, + typename scalar_type>::type>::value, + bool>::type = false> + __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const + { + // X contains multiple T + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! 
X should contain multiple T"); + +#if CK_USE_AMD_BUFFER_LOAD + bool constexpr use_amd_buffer_addressing = true; +#else + bool constexpr use_amd_buffer_addressing = false; +#endif + + if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing) + { + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + if constexpr(InvalidElementUseNumericalZeroValue) + { + return amd_buffer_load_invalid_element_return_zero, t_per_x>( + p_data_, i, is_valid_element, element_space_size_); + } + else + { + return amd_buffer_load_invalid_element_return_customized_value, + t_per_x>( + p_data_, i, is_valid_element, element_space_size_, invalid_element_value_); + } + } + else + { + if(is_valid_element) + { +#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS + X tmp; + + __builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X)); + + return tmp; +#else + return *c_style_pointer_cast(&p_data_[i]); +#endif + } + else + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return X{0}; + } + else + { + return X{invalid_element_value_}; + } + } + } + } + + template >::type, + typename scalar_type>::type>::value, + bool>::type = false> + __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x) + { + if constexpr(Op == InMemoryDataOperationEnum::Set) + { + this->template Set(i, is_valid_element, x); + } + else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd) + { + this->template AtomicAdd(i, is_valid_element, x); + } + else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax) + { + this->template AtomicMax(i, is_valid_element, x); + } + else if constexpr(Op == InMemoryDataOperationEnum::Add) + { + auto tmp = this->template Get(i, is_valid_element); + this->template Set(i, is_valid_element, x + tmp); + // tmp += x; + // this->template Set(i, is_valid_element, tmp); + } + } + + template >::type, + typename scalar_type>::type>::value, + bool>::type = false> + __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) + { + // X contains multiple T + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! 
X should contain multiple T"); + +#if CK_USE_AMD_BUFFER_STORE + bool constexpr use_amd_buffer_addressing = true; +#else + bool constexpr use_amd_buffer_addressing = false; +#endif + +#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE + bool constexpr workaround_int8_ds_write_issue = true; +#else + bool constexpr workaround_int8_ds_write_issue = false; +#endif + + if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing) + { + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_buffer_store, t_per_x>( + x, p_data_, i, is_valid_element, element_space_size_); + } + else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds && + is_same>::type, int8_t>::value && + workaround_int8_ds_write_issue) + { + if(is_valid_element) + { + // HACK: compiler would lower IR "store address_space(3)" into inefficient + // ISA, so I try to let compiler emit IR "store" which would be lower to + // ds_write_b128 + // TODO: remove this after compiler fix + static_assert((is_same, int8_t>::value && + is_same, int8_t>::value) || + (is_same, int8_t>::value && + is_same, int8x2_t>::value) || + (is_same, int8_t>::value && + is_same, int8x4_t>::value) || + (is_same, int8_t>::value && + is_same, int8x8_t>::value) || + (is_same, int8_t>::value && + is_same, int8x16_t>::value) || + (is_same, int8x4_t>::value && + is_same, int8x4_t>::value) || + (is_same, int8x8_t>::value && + is_same, int8x8_t>::value) || + (is_same, int8x16_t>::value && + is_same, int8x16_t>::value), + "wrong! not implemented for this combination, please add " + "implementation"); + + if constexpr(is_same, int8_t>::value && + is_same, int8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x2_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x4_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8_t>::value && + is_same, int8x16_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8x4_t>::value && + is_same, int8x4_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8x8_t>::value && + is_same, int8x8_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + else if constexpr(is_same, int8x16_t>::value && + is_same, int8x16_t>::value) + { + // HACK: cast pointer of x is bad + // TODO: remove this after compiler fix + *c_style_pointer_cast(&p_data_[i]) = + *c_style_pointer_cast(&x); + } + } + } + else + { + if(is_valid_element) + { +#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS + X tmp = x; 
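                // Experimental path: stage the vector in a local X and write it with
                // __builtin_memcpy (presumably to sidestep aliasing issues); the #else
                // branch below stores the vector through a pointer cast instead.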
+ + __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X)); +#else + *c_style_pointer_cast(&p_data_[i]) = x; +#endif + } + } + } + + template >::type, + typename scalar_type>::type>::value, + bool>::type = false> + __host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x) + { + using scalar_t = typename scalar_type>::type; + + // X contains multiple T + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! X should contain multiple T"); + + static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem"); + +#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT + bool constexpr use_amd_buffer_addressing = + is_same_v, int32_t> || + is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0); +#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT) + bool constexpr use_amd_buffer_addressing = is_same_v, int32_t>; +#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT + bool constexpr use_amd_buffer_addressing = + is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0); +#else + bool constexpr use_amd_buffer_addressing = false; +#endif + + if constexpr(use_amd_buffer_addressing) + { + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_buffer_atomic_add, t_per_x>( + x, p_data_, i, is_valid_element, element_space_size_); + } + else + { + if(is_valid_element) + { + atomic_add(c_style_pointer_cast(&p_data_[i]), x); + } + } + } + + template >::type, + typename scalar_type>::type>::value, + bool>::type = false> + __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x) + { + // X contains multiple T + constexpr index_t scalar_per_t_vector = scalar_type>::vector_size; + + constexpr index_t scalar_per_x_vector = scalar_type>::vector_size; + + static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, + "wrong! 
X should contain multiple T"); + + static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem"); + +#if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 + using scalar_t = typename scalar_type>::type; + bool constexpr use_amd_buffer_addressing = is_same_v, double>; +#else + bool constexpr use_amd_buffer_addressing = false; +#endif + + if constexpr(use_amd_buffer_addressing) + { + constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; + + amd_buffer_atomic_max, t_per_x>( + x, p_data_, i, is_valid_element, element_space_size_); + } + else if(is_valid_element) + { + atomic_max(c_style_pointer_cast(&p_data_[i]), x); + } + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return false; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; } +}; + +template +__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size) +{ + return DynamicBuffer{p, element_space_size}; +} + +template < + AddressSpaceEnum BufferAddressSpace, + typename T, + typename ElementSpaceSize, + typename X, + typename enable_if, remove_cvref_t>::value, bool>::type = false> +__host__ __device__ constexpr auto +make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element_value) +{ + return DynamicBuffer{ + p, element_space_size, invalid_element_value}; +} + +} // namespace ck diff --git a/include/ck/utility/enable_if.hpp b/include/ck/utility/enable_if.hpp new file mode 100644 index 00000000..297434b0 --- /dev/null +++ b/include/ck/utility/enable_if.hpp @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { + +template +using enable_if = std::enable_if; + +template +using enable_if_t = typename std::enable_if::type; + +} // namespace ck diff --git a/include/ck/utility/functional.hpp b/include/ck/utility/functional.hpp new file mode 100644 index 00000000..08e73078 --- /dev/null +++ b/include/ck/utility/functional.hpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/integral_constant.hpp" +#include "ck/utility/type.hpp" + +namespace ck { + +// TODO: right? wrong? +struct forwarder +{ + template + __host__ __device__ constexpr T&& operator()(T&& x) const + { + return static_cast(x); + } +}; + +struct swallow +{ + template + __host__ __device__ constexpr swallow(Ts&&...) 
+ { + } +}; + +template +struct logical_and +{ + constexpr bool operator()(const T& x, const T& y) const { return x && y; } +}; + +template +struct logical_or +{ + constexpr bool operator()(const T& x, const T& y) const { return x || y; } +}; + +template +struct logical_not +{ + constexpr bool operator()(const T& x) const { return !x; } +}; + +// Emulate if constexpr +template +struct static_if; + +template <> +struct static_if +{ + using Type = static_if; + + template + __host__ __device__ constexpr auto operator()(F f) const + { + // This is a trick for compiler: + // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will + // use it, + // this will make "f" a generic lambda, so that "f" won't be compiled + // until being + // instantiated here + f(forwarder{}); + return Type{}; + } + + template + __host__ __device__ static void Else(F) + { + } +}; + +template <> +struct static_if +{ + using Type = static_if; + + template + __host__ __device__ constexpr auto operator()(F) const + { + return Type{}; + } + + template + __host__ __device__ static void Else(F f) + { + // This is a trick for compiler: + // Pass forwarder to lambda "f" as "auto" argument, and make sure "f" will + // use it, + // this will make "f" a generic lambda, so that "f" won't be compiled + // until being + // instantiated here + f(forwarder{}); + } +}; + +template +struct conditional; + +template +struct conditional +{ + using type = X; +}; + +template +struct conditional +{ + using type = Y; +}; + +template +using conditional_t = typename conditional::type; + +// z = predicate ? x : y +template +constexpr auto conditional_expr(X&& x, Y&& y) +{ + if constexpr(predicate) + { + return std::forward(x); + } + else + { + return std::forward(y); + } +} + +} // namespace ck diff --git a/include/ck/utility/functional2.hpp b/include/ck/utility/functional2.hpp new file mode 100644 index 00000000..6f125ca4 --- /dev/null +++ b/include/ck/utility/functional2.hpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/functional.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { + +namespace detail { + +template +struct static_for_impl; + +template +struct static_for_impl> +{ + template + __host__ __device__ constexpr void operator()(F f) const + { + swallow{(f(Number{}), 0)...}; + } +}; + +} // namespace detail + +// F signature: F(Number) +template +struct static_for +{ + __host__ __device__ constexpr static_for() + { + static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0, + "Wrong! should satisfy (NEnd - NBegin) % Increment == 0"); + static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd), + "wrongs! should (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && " + "NBegin >= NEnd)"); + } + + template + __host__ __device__ constexpr void operator()(F f) const + { + detail::static_for_impl::type>{}( + f); + } +}; + +} // namespace ck diff --git a/include/ck/utility/functional3.hpp b/include/ck/utility/functional3.hpp new file mode 100644 index 00000000..06b67ef7 --- /dev/null +++ b/include/ck/utility/functional3.hpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
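// Illustrative sketch of the static_for defined in functional2.hpp above (the loop
// bounds and accumulator are made up for the example). Each invocation of the functor
// receives a Number<I>, so the index remains a compile-time constant:
//
//   index_t sum = 0;
//   static_for<0, 4, 1>{}([&](auto i) { sum += i.value; });  // sum == 0 + 1 + 2 + 3 == 6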
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/functional.hpp" +#include "ck/utility/functional2.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/multi_index.hpp" + +namespace ck { + +namespace detail { + +// RemainLengths: Sequence<...> +// Orders: Sequence<...> +template +struct static_ford_impl +{ + __host__ __device__ constexpr static_ford_impl() + { + static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); + } + + // F signature: F(Sequence<...>) + // CurrentOrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId) const + { + static_for<0, RemainLengths::Front(), 1>{}([=](auto I) { + static_ford_impl{}( + f, CurrentOrderedId::PushBack(I)); + }); + } +}; + +template +struct static_ford_impl, Orders> +{ + // F signature: F(Sequence<...>) + // OrderedId: Sequence<...> + template + __host__ __device__ constexpr void operator()(F f, OrderedId) const + { + // retrive unordered Id + f(OrderedId::ReorderGivenOld2New(Orders{})); + } +}; + +// RemainLengths: Sequence<...> +// Orders: Sequence<...> +template +struct ford_impl +{ + __host__ __device__ constexpr ford_impl() + { + static_assert(RemainLengths::GetSize() > 0, "wrong! should not get here"); + } + + // F signature: F(Array<...> multi_id) + // CurrentOrderdId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + for(index_t i = 0; i < RemainLengths::Front(); ++i) + { + ford_impl{}( + f, container_push_back(current_ordered_id, i)); + } + } +}; + +template +struct ford_impl, Orders> +{ + // F signature: F(Array<...> multi_id) + // CurrentOrderdId: Array<...> + template + __host__ __device__ constexpr void operator()(F f, CurrentOrderedId current_ordered_id) const + { + // retrive unordered Id + f(container_reorder_given_old2new(current_ordered_id, Orders{})); + } +}; + +} // namespace detail + +// Lengths is Sequence<...>, it is the length of each dimension for +// N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which static_ford +// will loop over each +// dimension +template ::type> +struct static_ford +{ + __host__ __device__ constexpr static_ford() + { + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! inconsistent size"); + } + + // F signature: F(Sequence<...> multi_id) + // multi_id is the unordered multi-index + template + __host__ __device__ constexpr void operator()(F f) const + { + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + detail::static_ford_impl{}(f, Sequence<>{}); + } +}; + +// Lengths is Sequence<...>, it is the length of each dimension for +// N-dimensional loop +// Orders is Sequence<...>, it is the order of dimension in which ford will loop +// over each +// dimension +template ::type> +struct ford +{ + __host__ __device__ constexpr ford() + { + static_assert(Lengths::GetSize() > 0, "wrong! Lengths is empty"); + static_assert(Lengths::GetSize() == Orders::GetSize(), "wrong! 
inconsistent size"); + } + + // F signature: F(Array<...> multi_id) + // multi_id is the unordered multi-index + template + __host__ __device__ constexpr void operator()(F f) const + { + constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{}); + + for(index_t i = 0; i < ordered_lengths.Front(); ++i) + { + detail::ford_impl{}(f, + make_multi_index(i)); + } + } +}; + +} // namespace ck diff --git a/include/ck/utility/functional4.hpp b/include/ck/utility/functional4.hpp new file mode 100644 index 00000000..6eeaf15c --- /dev/null +++ b/include/ck/utility/functional4.hpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_FUNCTIONAL4_HPP +#define CK_FUNCTIONAL4_HPP + +#include "sequence.hpp" +#include "tuple.hpp" +#include "array.hpp" + +namespace ck { + +namespace detail { + +template +struct unpack_impl; + +template +struct unpack_impl> +{ + template + __host__ __device__ constexpr auto operator()(F&& f, X&& x) const + { + return std::forward(f)(std::forward(x).At(Number{})...); + } +}; + +template +struct unpack2_impl; + +// TODO: remove this, after properly implementing unpack that takes any number of containers +template +struct unpack2_impl, Sequence> +{ + template + __host__ __device__ constexpr auto operator()(F&& f, X&& x, Y&& y) const + { + return std::forward(f)(std::forward(x).At(Number{})..., + std::forward(y).At(Number{})...); + } +}; + +} // namespace detail + +template +__host__ __device__ constexpr auto unpack(F&& f, X&& x) +{ + using X_ = remove_reference_t; + return detail::unpack_impl::type>{}( + std::forward(f), std::forward(x)); +} + +// TODO: properly implement unpack that takes any number of containers +template +__host__ __device__ constexpr auto unpack2(F&& f, X&& x, Y&& y) +{ + using X_ = remove_reference_t; + using Y_ = remove_reference_t; + return detail::unpack2_impl::type, + typename arithmetic_sequence_gen<0, Y_::Size(), 1>::type>{}( + std::forward(f), std::forward(x), std::forward(y)); +} + +} // namespace ck +#endif diff --git a/include/ck/utility/generic_memory_space_atomic.hpp b/include/ck/utility/generic_memory_space_atomic.hpp new file mode 100644 index 00000000..6a1ca966 --- /dev/null +++ b/include/ck/utility/generic_memory_space_atomic.hpp @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include "data_type.hpp" + +namespace ck { + +// Caution: DO NOT REMOVE +// intentionally have only declaration but no definition to cause compilation failure when trying to +// instantiate this template. The purpose is to make the implementation of atomic_add explicit for +// each datatype. 
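// For instance, the float2_t specialization below decomposes into two scalar
// atomicAdd calls on consecutive floats (each component is atomic on its own,
// the pair as a whole is not). A caller sketch, with a hypothetical function name:
//
//   __device__ void accumulate2(float2_t* p_dst, const float2_t& v)
//   {
//       ck::atomic_add<float2_t>(p_dst, v);
//   }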
+template +__device__ X atomic_add(X* p_dst, const X& x); + +template <> +__device__ int32_t atomic_add(int32_t* p_dst, const int32_t& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ uint32_t atomic_add(uint32_t* p_dst, const uint32_t& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ float atomic_add(float* p_dst, const float& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ double atomic_add(double* p_dst, const double& x) +{ + return atomicAdd(p_dst, x); +} + +template <> +__device__ float2_t atomic_add(float2_t* p_dst, const float2_t& x) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const vector_type vx{x}; + vector_type vy{0}; + + vy.template AsType()(I0) = + atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); + vy.template AsType()(I1) = + atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); + + return vy.template AsType()[I0]; +} + +template <> +__device__ double2_t atomic_add(double2_t* p_dst, const double2_t& x) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const vector_type vx{x}; + vector_type vy{0}; + + vy.template AsType()(I0) = + atomicAdd(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); + vy.template AsType()(I1) = + atomicAdd(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); + + return vy.template AsType()[I0]; +} + +// Caution: DO NOT REMOVE +// intentionally have only declaration but no definition to cause compilation failure when trying to +// instantiate this template. The purpose is to make the implementation of atomic_max explicit for +// each datatype. + +template +__device__ X atomic_max(X* p_dst, const X& x); + +template <> +__device__ int32_t atomic_max(int32_t* p_dst, const int32_t& x) +{ + return atomicMax(p_dst, x); +} + +template <> +__device__ uint32_t atomic_max(uint32_t* p_dst, const uint32_t& x) +{ + return atomicMax(p_dst, x); +} + +template <> +__device__ float atomic_max(float* p_dst, const float& x) +{ + return atomicMax(p_dst, x); +} + +template <> +__device__ double atomic_max(double* p_dst, const double& x) +{ + return atomicMax(p_dst, x); +} + +template <> +__device__ float2_t atomic_max(float2_t* p_dst, const float2_t& x) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + const vector_type vx{x}; + vector_type vy{0}; + + vy.template AsType()(I0) = + atomicMax(c_style_pointer_cast(p_dst), vx.template AsType()[I0]); + vy.template AsType()(I1) = + atomicMax(c_style_pointer_cast(p_dst) + 1, vx.template AsType()[I1]); + + return vy.template AsType()[I0]; +} + +} // namespace ck diff --git a/include/ck/utility/get_id.hpp b/include/ck/utility/get_id.hpp new file mode 100644 index 00000000..44ff4381 --- /dev/null +++ b/include/ck/utility/get_id.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
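// Sketch of how the helpers below are typically used inside a kernel (the kernel
// itself is hypothetical; only the ck:: calls come from this header):
//
//   __global__ void copy_1d(const float* src, float* dst, ck::index_t n)
//   {
//       const ck::index_t gid = ck::get_thread_global_1d_id(); // blockIdx.x * blockDim.x + threadIdx.x
//       if(gid < n)
//           dst[gid] = src[gid];
//   }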
+ +#pragma once + +#include "ck/ck.hpp" + +namespace ck { + +__host__ __device__ constexpr index_t get_warp_size() +{ + // warpSize is defined by HIP + return warpSize; +} + +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } + +__device__ index_t get_thread_global_1d_id() { return blockIdx.x * blockDim.x + threadIdx.x; } + +__device__ index_t get_warp_local_1d_id() { return threadIdx.x / get_warp_size(); } + +__device__ index_t get_block_1d_id() { return blockIdx.x; } + +__device__ index_t get_grid_size() { return gridDim.x; } + +__device__ index_t get_block_size() { return blockDim.x; } + +} // namespace ck diff --git a/include/ck/utility/ignore.hpp b/include/ck/utility/ignore.hpp new file mode 100644 index 00000000..ac33cbf9 --- /dev/null +++ b/include/ck/utility/ignore.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +// https://en.cppreference.com/w/cpp/utility/tuple/ignore + +namespace ck { + +namespace detail { +struct ignore_t +{ + template + constexpr void operator=(T&&) const noexcept + { + } +}; +} // namespace detail + +inline constexpr detail::ignore_t ignore; + +} // namespace ck diff --git a/include/ck/utility/inner_product.hpp b/include/ck/utility/inner_product.hpp new file mode 100644 index 00000000..0f45ec17 --- /dev/null +++ b/include/ck/utility/inner_product.hpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once +#include "data_type.hpp" + +namespace ck { + +template +__device__ void inner_product(const TA& a, const TB& b, TC& c); + +template <> +__device__ void inner_product(const float& a, const float& b, float& c) +{ +#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32) + asm volatile("\n \ + v_mac_f32 %0, %1, %2 \n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32) + asm volatile("\n \ + v_fmac_f32 %0, %1, %2 \n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#else + c += a * b; +#endif +} + +template <> +__device__ void +inner_product(const float2_t& a, const float2_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void +inner_product(const float4_t& a, const float4_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + inner_product(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + inner_product(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +template <> +__device__ void inner_product(const half2_t& a, const half2_t& b, float& c) +{ +#if defined(CK_USE_AMD_V_DOT2_F32_F16) +#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM + asm volatile("\n \ + v_dot2_f32_f16 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(a), "v"(b), "0"(c)); +#else + c = __builtin_amdgcn_sdot2(a, b, c, false); +#endif +#else + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + static_for<0, 2, 1>{}([&](auto i) { + c += 
type_convert(a_vector.AsType()[i]) * + type_convert(b_vector.AsType()[i]); + }); +#endif +} + +template <> +__device__ void inner_product(const half4_t& a, const half4_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void inner_product(const half8_t& a, const half8_t& b, float& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + inner_product(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + inner_product(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +template <> +__device__ void +inner_product(const int8x4_t& a, const int8x4_t& b, int32_t& c) +{ +#if defined(CK_USE_AMD_V_DOT4_I32_I8) +#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM + asm volatile("\n \ + v_dot4_i32_i8 %0, %1, %2, %0\n \ + " + : "=v"(c) + : "v"(bit_cast(a)), "v"(bit_cast(b)), "0"(c)); +#else + c = __builtin_amdgcn_sdot4(bit_cast(a), bit_cast(b), c, false); +#endif +#else + const vector_type a_vector{a}; + const vector_type b_vector{b}; + + static_for<0, 4, 1>{}([&](auto i) { + c += type_convert(a_vector.AsType()[i]) * + type_convert(b_vector.AsType()[i]); + }); +#endif +} + +template <> +__device__ void +inner_product(const int8x8_t& a, const int8x8_t& b, int32_t& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); +} + +template <> +__device__ void +inner_product(const int8x16_t& a, const int8x16_t& b, int32_t& c) +{ + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + inner_product(vector_type{a}.AsType()[I0], + vector_type{b}.AsType()[I0], + c); + + inner_product(vector_type{a}.AsType()[I1], + vector_type{b}.AsType()[I1], + c); + + inner_product(vector_type{a}.AsType()[I2], + vector_type{b}.AsType()[I2], + c); + + inner_product(vector_type{a}.AsType()[I3], + vector_type{b}.AsType()[I3], + c); +} + +} // namespace ck diff --git a/include/ck/utility/integral_constant.hpp b/include/ck/utility/integral_constant.hpp new file mode 100644 index 00000000..9aab4e24 --- /dev/null +++ b/include/ck/utility/integral_constant.hpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
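// The arithmetic operators defined below keep their results as integral_constant
// values, so index math on Number<> (the integral_constant alias used throughout
// this patch) stays usable in constexpr contexts. A small sketch, variable names
// made up:
//
//   constexpr auto six = ck::Number<2>{} * ck::Number<3>{}; // value 6, still a compile-time constant
//   constexpr auto two = ck::Number<6>{} / ck::Number<3>{}; // operator/ static_asserts that the divisor > 0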
+ +#pragma once + +namespace ck { + +template +struct integral_constant +{ + static constexpr T value = v; + typedef T value_type; + typedef integral_constant type; + __host__ __device__ constexpr operator value_type() const noexcept { return value; } + __host__ __device__ constexpr value_type operator()() const noexcept { return value; } +}; + +template +__host__ __device__ constexpr auto operator+(integral_constant, integral_constant) +{ + return integral_constant{}; +} + +template +__host__ __device__ constexpr auto operator-(integral_constant, integral_constant) +{ + static_assert(Y <= X, "wrong!"); + return integral_constant{}; +} + +template +__host__ __device__ constexpr auto operator*(integral_constant, integral_constant) +{ + return integral_constant{}; +} + +template +__host__ __device__ constexpr auto operator/(integral_constant, integral_constant) +{ + static_assert(Y > 0, "wrong!"); + return integral_constant{}; +} + +template +__host__ __device__ constexpr auto operator%(integral_constant, integral_constant) +{ + static_assert(Y > 0, "wrong!"); + return integral_constant{}; +} + +} // namespace ck diff --git a/include/ck/utility/is_known_at_compile_time.hpp b/include/ck/utility/is_known_at_compile_time.hpp new file mode 100644 index 00000000..81981544 --- /dev/null +++ b/include/ck/utility/is_known_at_compile_time.hpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "integral_constant.hpp" +#include "sequence.hpp" +#include "tuple.hpp" + +namespace ck { + +template +struct is_known_at_compile_time; + +template <> +struct is_known_at_compile_time +{ + static constexpr bool value = false; +}; + +template <> +struct is_known_at_compile_time +{ + static constexpr bool value = false; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template +struct is_known_at_compile_time> +{ + static constexpr bool value = true; +}; + +template +struct is_known_at_compile_time> +{ + __host__ __device__ static constexpr bool IsKnownAtCompileTime() + { + return container_reduce( + Tuple{}, + [](auto x, bool r) { + return is_known_at_compile_time>::value & r; + }, + true); + } + + static constexpr bool value = IsKnownAtCompileTime(); +}; + +} // namespace ck diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp new file mode 100644 index 00000000..a5e8e921 --- /dev/null +++ b/include/ck/utility/magic_division.hpp @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "integral_constant.hpp" +#include "number.hpp" +#include "type.hpp" +#include "tuple.hpp" + +namespace ck { + +// magic number division +// Caution: +// 1. For uint32_t as dividend: magic number division implementation being used would produce +// correct result if the dividend is uint32_t and its value is within 31-bit value range. +// 2. For int32_t as dividendd: magic number division for int32_t dividened has not been +// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number +// division implementation for uint32_t is then used. Therefore, dividend value need to be +// non-negative. +// TODO: +// 1. Implement magic number divison for int32_t +// 2. 
Implement magic number divison for unit32_t with 32-bit value range +struct MagicDivision +{ + // uint32_t + __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor) + { + // WARNING: magic division is only applicable for division inside this range. + // You should use the return value of CalculateMagicNumbers, if division is not inside this + // range. The "else" logic below is to quiet down run-time error. + if(divisor >= 1 && divisor <= INT32_MAX) + { + uint32_t shift = 0; + for(shift = 0; shift < 32; ++shift) + { + if((1U << shift) >= divisor) + { + break; + } + } + + uint64_t one = 1; + uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + // assert(multiplier <= 0xffffffffUL); + + return make_tuple(uint32_t(multiplier), shift); + } + else + { + return make_tuple(uint32_t(0), uint32_t(0)); + } + } + + __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor) + { + auto tmp = CalculateMagicNumbers(divisor); + + return tmp[Number<0>{}]; + } + + __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor) + { + auto tmp = CalculateMagicNumbers(divisor); + + return tmp[Number<1>{}]; + } + + // integral_constant + template + __host__ __device__ static constexpr auto + CalculateMagicNumbers(integral_constant) + { + constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor}); + + constexpr uint32_t multiplier = tmp[Number<0>{}]; + constexpr uint32_t shift = tmp[Number<1>{}]; + + return make_tuple(integral_constant{}, + integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicMultiplier(integral_constant) + { + constexpr uint32_t multiplier = CalculateMagicMultiplier(uint32_t{Divisor}); + + return integral_constant{}; + } + + template + __host__ __device__ static constexpr auto + CalculateMagicShift(integral_constant) + { + constexpr uint32_t shift = CalculateMagicShift(uint32_t{Divisor}); + + return integral_constant{}; + } + + // integral_constant + template + __host__ __device__ static constexpr auto + CalculateMagicNumbers(integral_constant) + { + return CalculateMagicNumbers(integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicMultiplier(integral_constant) + { + return CalculateMagicMultiplier(integral_constant{}); + } + + template + __host__ __device__ static constexpr auto + CalculateMagicShift(integral_constant) + { + return CalculateMagicShift(integral_constant{}); + } + + // magic division for uint32_t + __device__ static constexpr uint32_t + DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) + { + uint32_t tmp = __umulhi(dividend, multiplier); + return (tmp + dividend) >> shift; + } + + __host__ static constexpr uint32_t + DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift) + { + uint32_t tmp = static_cast(dividend) * multiplier >> 32; + return (tmp + dividend) >> shift; + } + + // magic division for int32_t + // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be + // non-negative for result to be correct + // TODO: figure out how to do magic number divison for int32_t as dividended + __device__ static constexpr int32_t + DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift) + { + uint32_t dividend_u32 = bit_cast(dividend_i32); + uint32_t tmp = __umulhi(dividend_u32, multiplier); + return (tmp + dividend_u32) >> shift; + } + + __host__ static constexpr int32_t + DoMagicDivision(int32_t dividend_i32, uint32_t 
multiplier, uint32_t shift) + { + uint32_t dividend_u32 = bit_cast(dividend_i32); + uint32_t tmp = static_cast(dividend_u32) * multiplier >> 32; + return (tmp + dividend_u32) >> shift; + } +}; + +} // namespace ck diff --git a/include/ck/utility/math.hpp b/include/ck/utility/math.hpp new file mode 100644 index 00000000..12203bd7 --- /dev/null +++ b/include/ck/utility/math.hpp @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "integral_constant.hpp" +#include "number.hpp" +#include "type.hpp" +#include "enable_if.hpp" + +namespace ck { +namespace math { + +template +struct scales +{ + __host__ __device__ constexpr T operator()(T a) const { return s * a; } +}; + +template +struct plus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } +}; + +template +struct minus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; } +}; + +struct multiplies +{ + template + __host__ __device__ constexpr auto operator()(const A& a, const B& b) const + { + return a * b; + } +}; + +template +struct maximize +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a >= b ? a : b; } +}; + +template +struct minimize +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a <= b ? a : b; } +}; + +template +struct integer_divide_ceiler +{ + __host__ __device__ constexpr T operator()(T a, T b) const + { + static_assert(is_same{} || is_same{}, "wrong type"); + + return (a + b - Number<1>{}) / b; + } +}; + +template +__host__ __device__ constexpr auto integer_divide_floor(X x, Y y) +{ + return x / y; +} + +template +__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y) +{ + return (x + y - Number<1>{}) / y; +} + +template +__host__ __device__ constexpr auto integer_least_multiple(X x, Y y) +{ + return y * integer_divide_ceil(x, y); +} + +template +__host__ __device__ constexpr T max(T x) +{ + return x; +} + +template +__host__ __device__ constexpr T max(T x, T y) +{ + return x > y ? x : y; +} + +template +__host__ __device__ constexpr index_t max(Number, index_t y) +{ + return X > y ? X : y; +} + +template +__host__ __device__ constexpr index_t max(index_t x, Number) +{ + return x > Y ? x : Y; +} + +template +__host__ __device__ constexpr auto max(X x, Ys... ys) +{ + static_assert(sizeof...(Ys) > 0, "not enough argument"); + + return max(x, max(ys...)); +} + +template +__host__ __device__ constexpr T min(T x) +{ + return x; +} + +template +__host__ __device__ constexpr T min(T x, T y) +{ + return x < y ? x : y; +} + +template +__host__ __device__ constexpr index_t min(Number, index_t y) +{ + return X < y ? X : y; +} + +template +__host__ __device__ constexpr index_t min(index_t x, Number) +{ + return x < Y ? x : Y; +} + +template +__host__ __device__ constexpr auto min(X x, Ys... 
ys) +{ + static_assert(sizeof...(Ys) > 0, "not enough argument"); + + return min(x, min(ys...)); +} + +template +__host__ __device__ constexpr T clamp(const T& x, const T& lowerbound, const T& upperbound) +{ + return min(max(x, lowerbound), upperbound); +} + +// disallow implicit type casting +template +__device__ T exp(T x); + +// TODO: add f16 support using v_exp_f16 + +template <> +__device__ float exp(float x) +{ + return __expf(x); +} + +template <> +__device__ double exp(double x) +{ + return exp(x); +} + +// greatest common divisor, aka highest common factor +__host__ __device__ constexpr index_t gcd(index_t x, index_t y) +{ + if(x < 0) + { + return gcd(-x, y); + } + else if(y < 0) + { + return gcd(x, -y); + } + else if(x == y || x == 0) + { + return y; + } + else if(y == 0) + { + return x; + } + else if(x > y) + { + return gcd(x % y, y); + } + else + { + return gcd(x, y % x); + } +} + +template +__host__ __device__ constexpr auto gcd(Number, Number) +{ + constexpr auto r = gcd(X, Y); + + return Number{}; +} + +template = 2, bool>::type = false> +__host__ __device__ constexpr auto gcd(X x, Ys... ys) +{ + return gcd(x, gcd(ys...)); +} + +// least common multiple +template +__host__ __device__ constexpr auto lcm(X x, Y y) +{ + return (x * y) / gcd(x, y); +} + +template = 2, bool>::type = false> +__host__ __device__ constexpr auto lcm(X x, Ys... ys) +{ + return lcm(x, lcm(ys...)); +} + +template +struct equal +{ + __host__ __device__ constexpr bool operator()(T x, T y) const { return x == y; } +}; + +template +struct less +{ + __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; } +}; + +} // namespace math +} // namespace ck diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp new file mode 100644 index 00000000..dc97666b --- /dev/null +++ b/include/ck/utility/math_v2.hpp @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
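The helpers in include/ck/utility/math.hpp above (integer_divide_ceil, integer_least_multiple, gcd, lcm) are what the kernels use for tile-count and padding arithmetic. A minimal host-only sketch of the same arithmetic, using plain int in place of ck::index_t / ck::Number<> and a simplified non-negative gcd recursion:

// Host-only sketch of the integer helpers from math.hpp above; standalone,
// plain int instead of ck::index_t / ck::Number<>.
constexpr int integer_divide_ceil(int x, int y) { return (x + y - 1) / y; }
constexpr int integer_least_multiple(int x, int y) { return y * integer_divide_ceil(x, y); }
constexpr int gcd(int x, int y) { return y == 0 ? x : gcd(y, x % y); } // non-negative inputs only
constexpr int lcm(int x, int y) { return (x * y) / gcd(x, y); }

int main()
{
    // e.g. 100 elements split into tiles of 32: 4 tiles, padded length 128
    static_assert(integer_divide_ceil(100, 32) == 4, "");
    static_assert(integer_least_multiple(100, 32) == 128, "");
    static_assert(gcd(12, 18) == 6 && lcm(12, 18) == 36, "");
    return 0;
}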
+ +#pragma once + +#include + +#include "ck/utility/data_type.hpp" +#include "ck/utility/type.hpp" + +namespace ck { +namespace math { + +// math functions for the host, some are implemented by calling C++ std functions + +static inline __host__ float abs(float x) { return std::abs(x); }; + +static inline __host__ double abs(double x) { return std::abs(x); }; + +static inline __host__ int8_t abs(int8_t x) +{ + int8_t sgn = x >> (8 - 1); + + return (x ^ sgn) - sgn; +}; + +static inline __host__ int32_t abs(int32_t x) +{ + int32_t sgn = x >> (32 - 1); + + return (x ^ sgn) - sgn; +}; + +static inline __host__ half_t abs(half_t x) +{ + uint16_t xx = ck::bit_cast(x); + + uint16_t abs_xx = xx & 0x7fff; + + half_t abs_x = ck::bit_cast(abs_xx); + + return abs_x; +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +static inline __host__ int4_t abs(int4_t x) +{ + int4_t sgn = x >> (4 - 1); + return (x ^ sgn) - sgn; +} +#endif + +static inline __host__ bool isnan(float x) { return std::isnan(x); }; + +static inline __host__ bool isnan(double x) { return std::isnan(x); }; + +static inline __host__ bool isnan(int8_t x) +{ + (void)x; + return false; +}; + +static inline __host__ bool isnan(int32_t x) +{ + (void)x; + return false; +}; + +static inline __host__ bool isnan(half_t x) +{ + uint16_t xx = ck::bit_cast(x); + + return (xx & 0x7FFF) > 0x7C00; +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +static inline __host__ bool isnan(int4_t x) +{ + (void)x; + return false; +}; +#endif + +static inline __host__ float sqrt(float x) { return std::sqrt(x); }; + +static inline __host__ double sqrt(double x) { return std::sqrt(x); }; + +// math functions for the HIP kernel, some are implemented by calling hip builtin functions + +static inline __device__ float abs(float x) { return ::abs(x); }; + +static inline __device__ double abs(double x) { return ::abs(x); }; + +static inline __device__ int8_t abs(int8_t x) +{ + int8_t sgn = x >> (8 - 1); + + return (x ^ sgn) - sgn; +}; + +static inline __device__ int32_t abs(int32_t x) +{ + int32_t sgn = x >> (32 - 1); + + return (x ^ sgn) - sgn; +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +static inline __device__ int4_t abs(int4_t x) +{ + int4_t sgn = x >> (4 - 1); + + return (x ^ sgn) - sgn; +}; +#endif + +static inline __device__ half_t abs(half_t x) +{ + uint16_t xx = ck::bit_cast(x); + + uint16_t abs_xx = xx & 0x7fff; + + half_t abs_x = ck::bit_cast(abs_xx); + + return abs_x; +}; + +static inline __device__ bool isnan(float x) { return ::isnan(x); }; + +static inline __device__ bool isnan(double x) { return ::isnan(x); }; + +static inline __device__ bool isnan(int8_t x) +{ + (void)x; + return false; +}; + +static inline __device__ bool isnan(int32_t x) +{ + (void)x; + return false; +}; + +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +static inline __device__ bool isnan(int4_t x) +{ + (void)x; + return false; +}; +#endif + +static inline __device__ bool isnan(half_t x) +{ + uint16_t xx = ck::bit_cast(x); + + return (xx & 0x7FFF) > 0x7C00; +}; + +static inline __device__ float sqrt(float x) { return ::sqrtf(x); }; + +static inline __device__ double sqrt(double x) { return ::sqrt(x); }; + +} // namespace math +} // namespace ck diff --git a/include/ck/utility/multi_index.hpp b/include/ck/utility/multi_index.hpp new file mode 100644 index 00000000..1d544c09 --- /dev/null +++ b/include/ck/utility/multi_index.hpp @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
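math_v2.hpp above implements abs and isnan for half_t by working on the raw 16-bit pattern: abs clears the sign bit (x & 0x7fff), and isnan tests for a magnitude strictly above the infinity pattern ((x & 0x7FFF) > 0x7C00). A standalone sketch of the same bit tricks, operating on uint16_t directly since half_t is a CK-internal type:

#include <cstdint>

// Clear the sign bit (bit 15) to get |x| of an IEEE binary16 pattern.
constexpr std::uint16_t fp16_abs_bits(std::uint16_t bits) { return bits & 0x7fff; }

// NaN: exponent all ones and mantissa non-zero, i.e. magnitude strictly above 0x7c00 (+inf).
constexpr bool fp16_isnan_bits(std::uint16_t bits) { return (bits & 0x7fff) > 0x7c00; }

static_assert(fp16_abs_bits(0xc000) == 0x4000, "abs(-2.0h) == 2.0h");
static_assert(!fp16_isnan_bits(0x7c00), "+inf is not NaN");
static_assert(fp16_isnan_bits(0x7e00), "quiet-NaN pattern is NaN");

int main() { return 0; }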
+ +#pragma once + +#include "common_header.hpp" + +#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX +#include "array_multi_index.hpp" +#else +#include "statically_indexed_array_multi_index.hpp" +#endif diff --git a/include/ck/utility/number.hpp b/include/ck/utility/number.hpp new file mode 100644 index 00000000..f3ca6b61 --- /dev/null +++ b/include/ck/utility/number.hpp @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_NUMBER_HPP +#define CK_NUMBER_HPP + +#include "integral_constant.hpp" + +namespace ck { + +template +using Number = integral_constant; + +template +using LongNumber = integral_constant; + +} // namespace ck +#endif diff --git a/include/ck/utility/print.hpp b/include/ck/utility/print.hpp new file mode 100644 index 00000000..eed1ca42 --- /dev/null +++ b/include/ck/utility/print.hpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_PRINT_HPP +#define CK_PRINT_HPP + +#include "array.hpp" +#include "statically_indexed_array.hpp" +#include "container_helper.hpp" +#include "sequence.hpp" + +namespace ck { + +template +__host__ __device__ void print_array(const char* s, T a) +{ + constexpr index_t nsize = a.Size(); + + printf("%s size %d, {", s, nsize); + static_for<0, nsize, 1>{}([&a](auto i) constexpr { printf("%d, ", int32_t{a[i]}); }); + printf("}\n"); +} + +} // namespace ck +#endif diff --git a/include/ck/utility/reduction_common.hpp b/include/ck/utility/reduction_common.hpp new file mode 100644 index 00000000..aceef7b2 --- /dev/null +++ b/include/ck/utility/reduction_common.hpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/reduction_enums.hpp" + +namespace ck { + +struct float_equal_one +{ + template + __host__ __device__ inline bool operator()(T x) + { + return x <= static_cast(1.0f) and x >= static_cast(1.0f); + }; +}; + +struct float_equal_zero +{ + template + __host__ __device__ inline bool operator()(T x) + { + return x <= static_cast(0.0f) and x >= static_cast(0.0f); + }; +}; + +template +static constexpr __device__ index_t get_shift() +{ + return (get_shift() + 1); +}; + +template <> +constexpr __device__ index_t get_shift<1>() +{ + return (0); +} + +} // namespace ck diff --git a/include/ck/utility/reduction_enums.hpp b/include/ck/utility/reduction_enums.hpp new file mode 100644 index 00000000..67856331 --- /dev/null +++ b/include/ck/utility/reduction_enums.hpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
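reduction_common.hpp above derives a shift amount from a compile-time size with the recursive get_shift template: the base case get_shift<1>() is 0 and each recursion step adds 1, so for a power-of-two N it yields log2(N), i.e. (1 << shift) == N. A host-only sketch of that recursion, assuming the halving step N/2 that the "shift" naming implies (the CK version returns index_t and is marked __device__):

#include <cstddef>

// get_shift<N>(): number of halvings from N down to 1, i.e. log2(N) for a power of two.
template <std::size_t N>
constexpr std::size_t get_shift() { return get_shift<N / 2>() + 1; }

template <>
constexpr std::size_t get_shift<1>() { return 0; }

static_assert(get_shift<1>() == 0, "");
static_assert(get_shift<64>() == 6, "");
static_assert(get_shift<256>() == 8, "");

int main() { return 0; }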
+ +#pragma once + +namespace ck { + +enum struct ReduceTensorOp +{ + ADD = 0, + MUL = 1, + MIN = 2, + MAX = 3, + AMAX = 4, + AVG = 5, + NORM1 = 6, + NORM2 = 7, + // MUL_NO_ZEROS = 8, +}; + +enum struct NanPropagation +{ + NOT_PROPAGATE_NAN = 0, + PROPAGATE_NAN = 1, +}; + +enum struct ReduceTensorIndices +{ + NO_INDICES = 0, + FLATTENED_INDICES = 1, +}; + +enum struct IndicesType +{ + INDICES_32BIT = 0, + INDICES_64BIT = 1, + INDICES_16BIT = 2, + INDICES_8BIT = 3, +}; + +} // namespace ck diff --git a/include/ck/utility/reduction_functions_accumulate.hpp b/include/ck/utility/reduction_functions_accumulate.hpp new file mode 100644 index 00000000..724e5599 --- /dev/null +++ b/include/ck/utility/reduction_functions_accumulate.hpp @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_operator.hpp" + +namespace ck { +namespace detail { + +// Check for NaN; guarantee NaNs are NOT propagated to result (i.e., ignore NaNs) +template +struct AccumulateWithNanIgnore +{ + __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal) + { + if(!ck::math::isnan(currVal)) + { + ReduceOperation{}(accuVal, currVal); + } + }; +}; + +template +struct AccumulateWithNanCheck; + +// Does not check for NaN; does not guarantee NaNs be propagated to result +// e.g., given that max(a, b) = a > b ? a : b +// then max(NaN, 1) returns 1 +// max(1, NaN) returns NaN +// since any comparison involving NaNs returns false +template +struct AccumulateWithNanCheck +{ + // cppcheck-suppress constParameter + __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal) + { + ReduceOperation{}(accuVal, currVal); + }; +}; + +// Check for NaN; guarantees NaNs be propagated to result +template +struct AccumulateWithNanCheck +{ + __host__ __device__ static inline void Calculate(AccDataType& accuVal, AccDataType currVal) + { + using ck::math::isnan; + + if(isnan(currVal)) + { + accuVal = currVal; + } + else + { + ReduceOperation{}(accuVal, currVal); + }; + }; +}; + +template +struct AccumulateWithIndexAndNanCheck; + +template +struct AccumulateWithIndexAndNanCheck +{ + __host__ __device__ static inline void + // cppcheck-suppress constParameter + Calculate(AccDataType& accuVal, + AccDataType currVal, + IndexDataType& accuIndex, + IndexDataType currIndex) + { + bool changed = false; + + ReduceOperation{}(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + }; +}; + +template +struct AccumulateWithIndexAndNanCheck +{ + // The method is called when the ReduceOperation is indexable and the user asked for indices + __host__ __device__ static inline void Calculate(AccDataType& accuVal, + AccDataType currVal, + IndexDataType& accuIndex, + IndexDataType currIndex) + { + using ck::math::isnan; + + if(isnan(currVal)) + { + accuVal = currVal; + accuIndex = currIndex; + } + else + { + bool changed = false; + + ReduceOperation{}(accuVal, currVal, changed); + + if(changed) + accuIndex = currIndex; + } + }; +}; + +} // namespace detail +} // namespace ck diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp new file mode 100644 index 00000000..25ae8fd3 --- /dev/null +++ b/include/ck/utility/reduction_operator.hpp @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced 
Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type.hpp" + +namespace ck { + +namespace reduce { + +// Every binary operator used in reduction is represented by a templated functor class. Each functor +// class must provide at least +// three members: +// 1) GetIdentityValue() -- the interface to return the "identity element" for the binary +// operator, "identity element" is the unique +// element in the algebraic space that doesn't affect the value of other elements +// when operated against them, and the concept is similar to zero vector in +// vector space +// (http://pages.cs.wisc.edu/~matthewb/pages/notes/pdf/linearalgebra/VectorSpaces.pdf). +// 2) IsCompatibleInMemoryDataOperation() -- return true if the reduction task corresponding to this +// operator can use the InMemoryDataOperation to finalize, or else it return false +// 3) operator() -- the first argument of the operator must be both an input & output, and the +// corresponding variable usually stores +// the accumulated result of many operator() calls; the second argument is only an +// input. For indexable binary +// operator, the second version of operator() has third argument (which is an +// output) to indicate whether the +// accumulated value (the first argument) has changed, in which case the recorded +// accumulated index also need be +// changed. + +struct Add +{ + template + __host__ __device__ static constexpr T GetIdentityValue() + { + return type_convert(0.0f); + }; + + __host__ __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::AtomicAdd || + operation == InMemoryDataOperationEnum::Set; + }; + + template + __host__ __device__ inline constexpr void operator()(T& a, T b) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the Add accumulator!"); + + a = a + b; + } +}; + +struct SquaredAdd +{ + template + __host__ __device__ static constexpr T GetIdentityValue() + { + return type_convert(0.0f); + }; + + __host__ __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::AtomicAdd || + operation == InMemoryDataOperationEnum::Set; + }; + + template + __host__ __device__ inline constexpr void operator()(T& a, T b) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the SquaredAdd accumulator!"); + + a = a + b * b; + } +}; + +struct Mul +{ + template + __host__ __device__ static constexpr T GetIdentityValue() + { + return type_convert(1.0f); + }; + + __host__ __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + return operation == InMemoryDataOperationEnum::Set; + }; + + template + __host__ __device__ inline constexpr void operator()(T& a, T b) const + { + static_assert(is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the Mul accumulator!"); + + a = a * b; + } +}; + +struct Max +{ + template + __host__ __device__ static constexpr T GetIdentityValue() + { + return NumericLimits::Lowest(); + }; + + __host__ __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_max to be added + return 
operation == InMemoryDataOperationEnum::Set; + }; + + template + __host__ __device__ inline constexpr void operator()(T& a, T b) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the Max accumulator!"); + + if(a < b) + a = b; + } + + template + __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the Max accumulator!"); + + if(a < b) + { + a = b; + changed = true; + } + } +}; + +struct Min +{ + template + __host__ __device__ static constexpr T GetIdentityValue() + { + return NumericLimits::Max(); + }; + + __host__ __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_min to be added + return operation == InMemoryDataOperationEnum::Set; + }; + + template + __host__ __device__ inline constexpr void operator()(T& a, T b) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the Min accumulator!"); + + if(a > b) + a = b; + } + + template + __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the Min accumulator!"); + + if(a > b) + { + a = b; + changed = true; + } + } +}; + +struct AMax +{ + template + __host__ __device__ static constexpr T GetIdentityValue() + { + return type_convert(0.0f); + }; + + __host__ __device__ static constexpr bool + IsCompatibleInMemoryDataOperation(InMemoryDataOperationEnum operation) + { + // ToChange: atomic_max to be added + return operation == InMemoryDataOperationEnum::Set; + }; + + template + __host__ __device__ inline constexpr void operator()(T& a, T b) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the AMax accumulator!"); + + if(a < b) + a = b; + } + + template + __host__ __device__ inline constexpr void operator()(T& a, T b, bool& changed) const + { + static_assert(is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value, + "The data type is not supported by the AMax accumulator!"); + + if(a < b) + { + a = b; + changed = true; + } + } +}; + +template +constexpr T GetIdentityValueForInMemoryDataOperation(InMemoryDataOperationEnum operation) +{ + T result = ck::type_convert(0.0f); + + if(operation == InMemoryDataOperationEnum::AtomicMax) + result = ck::NumericLimits::Lowest(); + + return (result); +}; + +template +struct InMemoryDataOperatonSupportedOnDataType +{ + static constexpr bool value = false; +}; + +template +struct InMemoryDataOperatonSupportedOnDataType +{ + static constexpr bool value = + is_same::value || is_same::value; +}; + +template +struct InMemoryDataOperatonSupportedOnDataType +{ + static constexpr bool value = + is_same::value || is_same::value; +}; + +template +struct InMemoryDataOperatonSupportedOnDataType +{ + static constexpr bool value = + is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value || is_same::value; +}; + +template +struct InMemoryDataOperatonSupportedOnDataType +{ + static constexpr bool value = + 
is_same::value || is_same::value || + is_same::value || is_same::value || + is_same::value; +}; + +} // namespace reduce +} // namespace ck diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp new file mode 100644 index 00000000..97b59722 --- /dev/null +++ b/include/ck/utility/sequence.hpp @@ -0,0 +1,899 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/integral_constant.hpp" +#include "ck/utility/type.hpp" +#include "ck/utility/functional.hpp" +#include "ck/utility/math.hpp" + +namespace ck { + +template +struct static_for; + +template +struct Sequence; + +template +struct sequence_split; + +template +struct sequence_reverse; + +template +struct sequence_map_inverse; + +template +struct is_valid_sequence_map; + +template +__host__ __device__ constexpr auto sequence_pop_front(Sequence); + +template +__host__ __device__ constexpr auto sequence_pop_back(Seq); + +template +struct Sequence +{ + using Type = Sequence; + using data_type = index_t; + + static constexpr index_t mSize = sizeof...(Is); + + __host__ __device__ static constexpr auto Size() { return Number{}; } + + __host__ __device__ static constexpr auto GetSize() { return Size(); } + + __host__ __device__ static constexpr index_t At(index_t I) + { + // the last dummy element is to prevent compiler complain about empty array, when mSize = 0 + const index_t mData[mSize + 1] = {Is..., 0}; + return mData[I]; + } + + template + __host__ __device__ static constexpr auto At(Number) + { + static_assert(I < mSize, "wrong! I too large"); + + return Number{}; + } + + template + __host__ __device__ static constexpr auto Get(Number) + { + return At(Number{}); + } + + template + __host__ __device__ constexpr auto operator[](I i) const + { + return At(i); + } + + template + __host__ __device__ static constexpr auto ReorderGivenNew2Old(Sequence /*new2old*/) + { + static_assert(sizeof...(Is) == sizeof...(IRs), + "wrong! reorder map should have the same size as Sequence to be rerodered"); + + static_assert(is_valid_sequence_map>::value, "wrong! invalid reorder map"); + + return Sequence{})...>{}; + } + + // MapOld2New is Sequence<...> + template + __host__ __device__ static constexpr auto ReorderGivenOld2New(MapOld2New) + { + static_assert(MapOld2New::Size() == Size(), + "wrong! reorder map should have the same size as Sequence to be rerodered"); + + static_assert(is_valid_sequence_map::value, "wrong! invalid reorder map"); + + return ReorderGivenNew2Old(typename sequence_map_inverse::type{}); + } + + __host__ __device__ static constexpr auto Reverse() + { + return typename sequence_reverse::type{}; + } + + __host__ __device__ static constexpr auto Front() + { + static_assert(mSize > 0, "wrong!"); + return At(Number<0>{}); + } + + __host__ __device__ static constexpr auto Back() + { + static_assert(mSize > 0, "wrong!"); + return At(Number{}); + } + + __host__ __device__ static constexpr auto PopFront() { return sequence_pop_front(Type{}); } + + __host__ __device__ static constexpr auto PopBack() { return sequence_pop_back(Type{}); } + + template + __host__ __device__ static constexpr auto PushFront(Sequence) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto PushFront(Number...) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto PushBack(Sequence) + { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto PushBack(Number...) 
+ { + return Sequence{}; + } + + template + __host__ __device__ static constexpr auto Extract(Number...) + { + return Sequence{})...>{}; + } + + template + __host__ __device__ static constexpr auto Extract(Sequence) + { + return Sequence{})...>{}; + } + + template + __host__ __device__ static constexpr auto Modify(Number, Number) + { + static_assert(I < Size(), "wrong!"); + + using seq_split = sequence_split; + constexpr auto seq_left = typename seq_split::left_type{}; + constexpr auto seq_right = typename seq_split::right_type{}.PopFront(); + + return seq_left.PushBack(Number{}).PushBack(seq_right); + } + + template + __host__ __device__ static constexpr auto Transform(F f) + { + return Sequence{}; + } + + __host__ __device__ static void Print() + { + printf("{"); + printf("size %d, ", index_t{Size()}); + static_for<0, Size(), 1>{}([&](auto i) { printf("%d ", At(i).value); }); + printf("}"); + } +}; + +// merge sequence +template +struct sequence_merge +{ + using type = typename sequence_merge::type>::type; +}; + +template +struct sequence_merge, Sequence> +{ + using type = Sequence; +}; + +template +struct sequence_merge +{ + using type = Seq; +}; + +// generate sequence +template +struct sequence_gen +{ + template + struct sequence_gen_impl + { + static constexpr index_t NRemainLeft = NRemain / 2; + static constexpr index_t NRemainRight = NRemain - NRemainLeft; + static constexpr index_t IMiddle = IBegin + NRemainLeft; + + using type = typename sequence_merge< + typename sequence_gen_impl::type, + typename sequence_gen_impl::type>::type; + }; + + template + struct sequence_gen_impl + { + static constexpr index_t Is = G{}(Number{}); + using type = Sequence; + }; + + template + struct sequence_gen_impl + { + using type = Sequence<>; + }; + + using type = typename sequence_gen_impl<0, NSize, F>::type; +}; + +// arithmetic sequence +template +struct arithmetic_sequence_gen +{ + struct F + { + __host__ __device__ constexpr index_t operator()(index_t i) const + { + return i * Increment + IBegin; + } + }; + + using type0 = typename sequence_gen<(IEnd - IBegin) / Increment, F>::type; + using type1 = Sequence<>; + + static constexpr bool kHasContent = + (Increment > 0 && IBegin < IEnd) || (Increment < 0 && IBegin > IEnd); + + using type = typename conditional::type; +}; + +// uniform sequence +template +struct uniform_sequence_gen +{ + struct F + { + __host__ __device__ constexpr index_t operator()(index_t) const { return I; } + }; + + using type = typename sequence_gen::type; +}; + +// reverse inclusive scan (with init) sequence +template +struct sequence_reverse_inclusive_scan; + +template +struct sequence_reverse_inclusive_scan, Reduce, Init> +{ + using old_scan = typename sequence_reverse_inclusive_scan, Reduce, Init>::type; + + static constexpr index_t new_reduce = Reduce{}(I, old_scan{}.Front()); + + using type = typename sequence_merge, old_scan>::type; +}; + +template +struct sequence_reverse_inclusive_scan, Reduce, Init> +{ + using type = Sequence; +}; + +template +struct sequence_reverse_inclusive_scan, Reduce, Init> +{ + using type = Sequence<>; +}; + +// split sequence +template +struct sequence_split +{ + static constexpr index_t NSize = Seq{}.Size(); + + using range0 = typename arithmetic_sequence_gen<0, I, 1>::type; + using range1 = typename arithmetic_sequence_gen::type; + + using left_type = decltype(Seq::Extract(range0{})); + using right_type = decltype(Seq::Extract(range1{})); +}; + +// reverse sequence +template +struct sequence_reverse +{ + static constexpr index_t NSize = 
Seq{}.Size(); + + using seq_split = sequence_split; + using type = typename sequence_merge< + typename sequence_reverse::type, + typename sequence_reverse::type>::type; +}; + +template +struct sequence_reverse> +{ + using type = Sequence; +}; + +template +struct sequence_reverse> +{ + using type = Sequence; +}; + +#if 1 +template +struct sequence_reduce +{ + using type = typename sequence_reduce::type>::type; +}; + +template +struct sequence_reduce, Sequence> +{ + using type = Sequence; +}; + +template +struct sequence_reduce +{ + using type = Seq; +}; +#endif + +template +struct sequence_sort_impl +{ + template + struct sorted_sequence_merge_impl + { + static constexpr bool choose_left = LeftValues::Front() < RightValues::Front(); + + static constexpr index_t chosen_value = + choose_left ? LeftValues::Front() : RightValues::Front(); + static constexpr index_t chosen_id = choose_left ? LeftIds::Front() : RightIds::Front(); + + using new_merged_values = decltype(MergedValues::PushBack(Number{})); + using new_merged_ids = decltype(MergedIds::PushBack(Number{})); + + using new_left_values = + typename conditional::type; + using new_left_ids = + typename conditional::type; + + using new_right_values = + typename conditional::type; + using new_right_ids = + typename conditional::type; + + using merge = sorted_sequence_merge_impl; + // this is output + using merged_values = typename merge::merged_values; + using merged_ids = typename merge::merged_ids; + }; + + template + struct sorted_sequence_merge_impl, + Sequence<>, + MergedValues, + MergedIds, + Comp> + { + using merged_values = typename sequence_merge::type; + using merged_ids = typename sequence_merge::type; + }; + + template + struct sorted_sequence_merge_impl, + Sequence<>, + RightValues, + RightIds, + MergedValues, + MergedIds, + Comp> + { + using merged_values = typename sequence_merge::type; + using merged_ids = typename sequence_merge::type; + }; + + template + struct sorted_sequence_merge + { + using merge = sorted_sequence_merge_impl, + Sequence<>, + Comp>; + + using merged_values = typename merge::merged_values; + using merged_ids = typename merge::merged_ids; + }; + + static constexpr index_t nsize = Values::Size(); + + using split_unsorted_values = sequence_split; + using split_unsorted_ids = sequence_split; + + using left_unsorted_values = typename split_unsorted_values::left_type; + using left_unsorted_ids = typename split_unsorted_ids::left_type; + using left_sort = sequence_sort_impl; + using left_sorted_values = typename left_sort::sorted_values; + using left_sorted_ids = typename left_sort::sorted_ids; + + using right_unsorted_values = typename split_unsorted_values::right_type; + using right_unsorted_ids = typename split_unsorted_ids::right_type; + using right_sort = sequence_sort_impl; + using right_sorted_values = typename right_sort::sorted_values; + using right_sorted_ids = typename right_sort::sorted_ids; + + using merged_sorted = sorted_sequence_merge; + + using sorted_values = typename merged_sorted::merged_values; + using sorted_ids = typename merged_sorted::merged_ids; +}; + +template +struct sequence_sort_impl, Sequence, Compare> +{ + static constexpr bool choose_x = Compare{}(ValueX, ValueY); + + using sorted_values = + typename conditional, Sequence>::type; + using sorted_ids = typename conditional, Sequence>::type; +}; + +template +struct sequence_sort_impl, Sequence, Compare> +{ + using sorted_values = Sequence; + using sorted_ids = Sequence; +}; + +template +struct sequence_sort_impl, Sequence<>, Compare> 
+{ + using sorted_values = Sequence<>; + using sorted_ids = Sequence<>; +}; + +template +struct sequence_sort +{ + using unsorted_ids = typename arithmetic_sequence_gen<0, Values::Size(), 1>::type; + using sort = sequence_sort_impl; + + // this is output + using type = typename sort::sorted_values; + using sorted2unsorted_map = typename sort::sorted_ids; +}; + +template +struct sequence_unique_sort +{ + template + struct sorted_sequence_uniquify_impl + { + static constexpr index_t current_value = RemainValues::Front(); + static constexpr index_t current_id = RemainIds::Front(); + + static constexpr bool is_unique_value = (current_value != UniquifiedValues::Back()); + + using new_remain_values = decltype(RemainValues::PopFront()); + using new_remain_ids = decltype(RemainIds::PopFront()); + + using new_uniquified_values = + typename conditional{})), + UniquifiedValues>::type; + + using new_uniquified_ids = + typename conditional{})), + UniquifiedIds>::type; + + using uniquify = sorted_sequence_uniquify_impl; + + // this is output + using uniquified_values = typename uniquify::uniquified_values; + using uniquified_ids = typename uniquify::uniquified_ids; + }; + + template + struct sorted_sequence_uniquify_impl, + Sequence<>, + UniquifiedValues, + UniquifiedIds, + Eq> + { + using uniquified_values = UniquifiedValues; + using uniquified_ids = UniquifiedIds; + }; + + template + struct sorted_sequence_uniquify + { + using uniquify = sorted_sequence_uniquify_impl, + Sequence, + Eq>; + + using uniquified_values = typename uniquify::uniquified_values; + using uniquified_ids = typename uniquify::uniquified_ids; + }; + + using sort = sequence_sort; + using sorted_values = typename sort::type; + using sorted_ids = typename sort::sorted2unsorted_map; + + using uniquify = sorted_sequence_uniquify; + + // this is output + using type = typename uniquify::uniquified_values; + using sorted2unsorted_map = typename uniquify::uniquified_ids; +}; + +template +struct is_valid_sequence_map : is_same::type, + typename sequence_sort>::type> +{ +}; + +template +struct sequence_map_inverse +{ + template + struct sequence_map_inverse_impl + { + static constexpr auto new_y2x = + WorkingY2X::Modify(X2Y::At(Number{}), Number{}); + + using type = + typename sequence_map_inverse_impl:: + type; + }; + + template + struct sequence_map_inverse_impl + { + using type = WorkingY2X; + }; + + using type = + typename sequence_map_inverse_impl::type, + 0, + SeqMap::Size()>::type; +}; + +template +__host__ __device__ constexpr bool operator==(Sequence, Sequence) +{ + return ((Xs == Ys) && ...); +} + +template +__host__ __device__ constexpr auto operator+(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs + Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator-(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs - Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator*(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs * Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator/(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size"); + + return Sequence<(Xs / Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator%(Sequence, Sequence) +{ + static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! 
inconsistent size"); + + return Sequence<(Xs % Ys)...>{}; +} + +template +__host__ __device__ constexpr auto operator+(Sequence, Number) +{ + return Sequence<(Xs + Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator-(Sequence, Number) +{ + return Sequence<(Xs - Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator*(Sequence, Number) +{ + return Sequence<(Xs * Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator/(Sequence, Number) +{ + return Sequence<(Xs / Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator%(Sequence, Number) +{ + return Sequence<(Xs % Y)...>{}; +} + +template +__host__ __device__ constexpr auto operator+(Number, Sequence) +{ + return Sequence<(Y + Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator-(Number, Sequence) +{ + return Sequence<(Y - Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator*(Number, Sequence) +{ + return Sequence<(Y * Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator/(Number, Sequence) +{ + return Sequence<(Y / Xs)...>{}; +} + +template +__host__ __device__ constexpr auto operator%(Number, Sequence) +{ + return Sequence<(Y % Xs)...>{}; +} + +template +__host__ __device__ constexpr auto sequence_pop_front(Sequence) +{ + return Sequence{}; +} + +template +__host__ __device__ constexpr auto sequence_pop_back(Seq) +{ + static_assert(Seq::Size() > 0, "wrong! cannot pop an empty Sequence!"); + return sequence_pop_front(Seq::Reverse()).Reverse(); +} + +template +__host__ __device__ constexpr auto merge_sequences(Seqs...) +{ + return typename sequence_merge::type{}; +} + +template +__host__ __device__ constexpr auto transform_sequences(F f, Sequence) +{ + return Sequence{}; +} + +template +__host__ __device__ constexpr auto transform_sequences(F f, Sequence, Sequence) +{ + static_assert(Sequence::mSize == Sequence::mSize, "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto +transform_sequences(F f, Sequence, Sequence, Sequence) +{ + static_assert(Sequence::mSize == Sequence::mSize && + Sequence::mSize == Sequence::mSize, + "Dim not the same"); + + return Sequence{}; +} + +template +__host__ __device__ constexpr auto reverse_inclusive_scan_sequence(Seq, Reduce, Number) +{ + return typename sequence_reverse_inclusive_scan::type{}; +} + +template +__host__ __device__ constexpr auto reverse_exclusive_scan_sequence(Seq, Reduce, Number) +{ + return reverse_inclusive_scan_sequence(Seq::PopFront(), Reduce{}, Number{}) + .PushBack(Number{}); +} + +template +__host__ __device__ constexpr auto inclusive_scan_sequence(Seq, Reduce, Number) +{ + return reverse_inclusive_scan_sequence(Seq{}.Reverse(), Reduce{}, Number{}).Reverse(); +} + +template +__host__ __device__ constexpr auto pick_sequence_elements_by_ids(Seq, Sequence /* ids */) +{ + return Sequence{})...>{}; +} + +#if 1 +namespace detail { +template +struct pick_sequence_elements_by_mask_impl +{ + using new_work_seq = typename conditional::type; + + using type = + typename pick_sequence_elements_by_mask_impl::type; +}; + +template +struct pick_sequence_elements_by_mask_impl, Sequence<>> +{ + using type = WorkSeq; +}; + +} // namespace detail + +template +__host__ __device__ constexpr auto pick_sequence_elements_by_mask(Seq, Mask) +{ + static_assert(Seq::Size() == Mask::Size(), "wrong!"); + + return typename detail::pick_sequence_elements_by_mask_impl, Seq, Mask>::type{}; +} + +namespace detail { +template +struct modify_sequence_elements_by_ids_impl 
+{ + using new_work_seq = decltype(WorkSeq::Modify(RemainIds::Front(), RemainValues::Front())); + + using type = + typename modify_sequence_elements_by_ids_impl::type; +}; + +template +struct modify_sequence_elements_by_ids_impl, Sequence<>> +{ + using type = WorkSeq; +}; +} // namespace detail + +template +__host__ __device__ constexpr auto modify_sequence_elements_by_ids(Seq, Values, Ids) +{ + static_assert(Values::Size() == Ids::Size() && Seq::Size() >= Values::Size(), "wrong!"); + + return typename detail::modify_sequence_elements_by_ids_impl::type{}; +} +#endif + +template +__host__ __device__ constexpr index_t +reduce_on_sequence(Seq, Reduce f, Number /*initial_value*/) +{ + index_t result = Init; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + result = f(result, Seq::At(i)); + } + + return result; +} + +// TODO: a generic any_of for any container +template +__host__ __device__ constexpr bool sequence_any_of(Seq, F f) +{ + bool flag = false; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag || f(Seq::At(i)); + } + + return flag; +} + +// TODO: a generic all_of for any container +template +__host__ __device__ constexpr bool sequence_all_of(Seq, F f) +{ + bool flag = true; + + for(index_t i = 0; i < Seq::Size(); ++i) + { + flag = flag && f(Seq::At(i)); + } + + return flag; +} + +template +using sequence_merge_t = typename sequence_merge::type; + +template +using uniform_sequence_gen_t = typename uniform_sequence_gen::type; + +} // namespace ck diff --git a/include/ck/utility/sequence_helper.hpp b/include/ck/utility/sequence_helper.hpp new file mode 100644 index 00000000..db25c27e --- /dev/null +++ b/include/ck/utility/sequence_helper.hpp @@ -0,0 +1,37 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/tuple.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto make_sequence(Number...) +{ + return Sequence{}; +} + +// F returns index_t +template +__host__ __device__ constexpr auto generate_sequence(F, Number) +{ + return typename sequence_gen::type{}; +} + +// F returns Number<> +template +__host__ __device__ constexpr auto generate_sequence_v2(F&& f, Number) +{ + return unpack([&f](auto&&... xs) { return make_sequence(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + +template +__host__ __device__ constexpr auto to_sequence(Tuple...>) +{ + return Sequence{}; +} + +} // namespace ck diff --git a/include/ck/utility/span.hpp b/include/ck/utility/span.hpp new file mode 100644 index 00000000..1e501214 --- /dev/null +++ b/include/ck/utility/span.hpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
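sequence.hpp above gives Sequence<...> element-wise arithmetic (operator+ yields Sequence<(Xs + Ys)...> and so on) plus reductions such as reduce_on_sequence. The same two ideas in a minimal standalone form, using std::integer_sequence in place of ck::Sequence and a C++17 fold in place of the loop-based reduction:

#include <utility>

// Element-wise addition, analogous to operator+(Sequence<Xs...>, Sequence<Ys...>) above.
template <int... Xs, int... Ys>
constexpr auto add(std::integer_sequence<int, Xs...>, std::integer_sequence<int, Ys...>)
{
    static_assert(sizeof...(Xs) == sizeof...(Ys), "wrong! inconsistent size");
    return std::integer_sequence<int, (Xs + Ys)...>{};
}

// Reduction over the elements, analogous to reduce_on_sequence with math::plus.
template <int... Xs>
constexpr int sum(std::integer_sequence<int, Xs...>)
{
    return (0 + ... + Xs);
}

static_assert(sum(add(std::integer_sequence<int, 1, 2, 3>{},
                      std::integer_sequence<int, 10, 20, 30>{})) == 66,
              "");

int main() { return 0; }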
+ +#pragma once + +#include +#include +#include + +namespace ck { + +template +class span +{ + public: + using element_type = T; + using value_type = std::remove_cv_t; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using pointer = element_type*; + using const_pointer = const element_type*; + using reference = element_type&; + using const_reference = const element_type&; + using iterator = pointer; + using const_iterator = pointer; + + constexpr span() : span(nullptr, size_type{0}) {} + + constexpr span(pointer first, size_type count) : ptr_(first), size_(count) {} + + constexpr span(pointer first, pointer last) : span(first, last - first) {} + + template + constexpr span(element_type (&arr)[N]) noexcept : span(arr, N) + { + } + + template + constexpr span(std::array& arr) noexcept : span(arr.data(), N) + { + } + + template + constexpr span(const Container& container) : span(container.data(), container.size()) + { + } + + constexpr iterator begin() const noexcept { return ptr_; } + constexpr const_iterator cbegin() const noexcept { return begin(); } + + constexpr iterator end() const noexcept { return begin() + size(); } + constexpr const_iterator cend() const noexcept { return end(); } + + constexpr reference front() const { return *begin(); } + constexpr reference back() const { return *(--end()); } + + constexpr reference operator[](size_type idx) const { return *(begin() + idx); } + constexpr pointer data() const noexcept { return ptr_; } + + constexpr size_type size() const noexcept { return size_; } + + private: + pointer ptr_; + size_type size_; +}; + +} // namespace ck diff --git a/include/ck/utility/static_buffer.hpp b/include/ck/utility/static_buffer.hpp new file mode 100644 index 00000000..dd25c962 --- /dev/null +++ b/include/ck/utility/static_buffer.hpp @@ -0,0 +1,195 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "statically_indexed_array.hpp" + +namespace ck { + +// static buffer for scalar +template // TODO remove this bool, no longer needed +struct StaticBuffer : public StaticallyIndexedArray +{ + using type = T; + using base = StaticallyIndexedArray; + + __host__ __device__ constexpr StaticBuffer() : base{} {} + + template + __host__ __device__ constexpr StaticBuffer& operator=(const Tuple& y) + { + static_assert(base::Size() == sizeof...(Ys), "wrong! 
size not the same"); + StaticBuffer& x = *this; + static_for<0, base::Size(), 1>{}([&](auto i) { x(i) = y[i]; }); + return x; + } + + __host__ __device__ constexpr StaticBuffer& operator=(const T& y) + { + StaticBuffer& x = *this; + static_for<0, base::Size(), 1>{}([&](auto i) { x(i) = y; }); + return x; + } + + __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } + + // read access + template + __host__ __device__ constexpr const T& operator[](Number i) const + { + return base::operator[](i); + } + + // write access + template + __host__ __device__ constexpr T& operator()(Number i) + { + return base::operator()(i); + } + + __host__ __device__ void Set(T x) + { + static_for<0, N, 1>{}([&](auto i) { operator()(i) = T{x}; }); + } + + __host__ __device__ void Clear() { Set(T{0}); } +}; + +// static buffer for vector +template ::value, bool>::type = false> +struct StaticBufferTupleOfVector + : public StaticallyIndexedArray, NumOfVector> +{ + using V = typename vector_type::type; + using base = StaticallyIndexedArray, NumOfVector>; + + static constexpr auto s_per_v = Number{}; + static constexpr auto num_of_v_ = Number{}; + static constexpr auto s_per_buf = s_per_v * num_of_v_; + + __host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {} + + __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } + + __host__ __device__ static constexpr index_t Size() { return s_per_buf; }; + + // Get S + // i is offset of S + template + __host__ __device__ constexpr const S& operator[](Number i) const + { + constexpr auto i_v = i / s_per_v; + constexpr auto i_s = i % s_per_v; + + return base::operator[](i_v).template AsType()[i_s]; + } + + // Set S + // i is offset of S + template + __host__ __device__ constexpr S& operator()(Number i) + { + constexpr auto i_v = i / s_per_v; + constexpr auto i_s = i % s_per_v; + + return base::operator()(i_v).template AsType()(i_s); + } + + // Get X + // i is offset of S, not X. i should be aligned to X + template ::value, bool>::type = false> + __host__ __device__ constexpr auto GetAsType(Number i) const + { + constexpr auto s_per_x = Number>::vector_size>{}; + + static_assert(s_per_v % s_per_x == 0, "wrong! V must one or multiple X"); + static_assert(i % s_per_x == 0, "wrong!"); + + constexpr auto i_v = i / s_per_v; + constexpr auto i_x = (i % s_per_v) / s_per_x; + + return base::operator[](i_v).template AsType()[i_x]; + } + + // Set X + // i is offset of S, not X. i should be aligned to X + template ::value, bool>::type = false> + __host__ __device__ constexpr void SetAsType(Number i, X x) + { + constexpr auto s_per_x = Number>::vector_size>{}; + + static_assert(s_per_v % s_per_x == 0, "wrong! V must contain one or multiple X"); + static_assert(i % s_per_x == 0, "wrong!"); + + constexpr auto i_v = i / s_per_v; + constexpr auto i_x = (i % s_per_v) / s_per_x; + + base::operator()(i_v).template AsType()(i_x) = x; + } + + // Get read access to vector_type V + // i is offset of S, not V. 
i should be aligned to V + template + __host__ __device__ constexpr const auto& GetVectorTypeReference(Number i) const + { + static_assert(i % s_per_v == 0, "wrong!"); + + constexpr auto i_v = i / s_per_v; + + return base::operator[](i_v); + } + + // Get write access to vector_type V + // i is offset of S, not V. i should be aligned to V + template + __host__ __device__ constexpr auto& GetVectorTypeReference(Number i) + { + static_assert(i % s_per_v == 0, "wrong!"); + + constexpr auto i_v = i / s_per_v; + + return base::operator()(i_v); + } + + __host__ __device__ void Clear() + { + constexpr index_t NumScalars = NumOfVector * ScalarPerVector; + + static_for<0, NumScalars, 1>{}([&](auto i) { SetAsType(i, S{0}); }); + } +}; + +template +__host__ __device__ constexpr auto make_static_buffer(Number) +{ + return StaticBuffer{}; +} + +template +__host__ __device__ constexpr auto make_static_buffer(LongNumber) +{ + return StaticBuffer{}; +} + +} // namespace ck diff --git a/include/ck/utility/statically_indexed_array.hpp b/include/ck/utility/statically_indexed_array.hpp new file mode 100644 index 00000000..3438776f --- /dev/null +++ b/include/ck/utility/statically_indexed_array.hpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_STATICALLY_INDEXED_ARRAY_HPP +#define CK_STATICALLY_INDEXED_ARRAY_HPP + +#include "functional2.hpp" +#include "sequence.hpp" +#include "tuple.hpp" + +namespace ck { + +namespace detail { +template +struct tuple_concat; + +template +struct tuple_concat, Tuple> +{ + using type = Tuple; +}; + +template +struct StaticallyIndexedArrayImpl +{ + using type = + typename tuple_concat::type, + typename StaticallyIndexedArrayImpl::type>::type; +}; + +template +struct StaticallyIndexedArrayImpl +{ + using type = Tuple<>; +}; + +template +struct StaticallyIndexedArrayImpl +{ + using type = Tuple; +}; +} // namespace detail + +template +using StaticallyIndexedArray = typename detail::StaticallyIndexedArrayImpl::type; + +template +__host__ __device__ constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs) +{ + return StaticallyIndexedArray(x, static_cast(xs)...); +} + +// make empty StaticallyIndexedArray +template +__host__ __device__ constexpr auto make_statically_indexed_array() +{ + return StaticallyIndexedArray(); +} + +template +struct StaticallyIndexedArray_v2 +{ + __host__ __device__ constexpr StaticallyIndexedArray_v2() = default; + + __host__ __device__ static constexpr index_t Size() { return N; } + + // read access + template + __host__ __device__ constexpr const auto& At(Number) const + { + static_assert(I < N, "wrong! out of range"); + + return data_[I]; + } + + // write access + template + __host__ __device__ constexpr auto& At(Number) + { + static_assert(I < N, "wrong! 
out of range"); + + return data_[I]; + } + + // read access + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + // write access + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return At(i); + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } + + T data_[N]; +}; + +} // namespace ck +#endif diff --git a/include/ck/utility/statically_indexed_array_multi_index.hpp b/include/ck/utility/statically_indexed_array_multi_index.hpp new file mode 100644 index 00000000..21b2941b --- /dev/null +++ b/include/ck/utility/statically_indexed_array_multi_index.hpp @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP +#define CK_STATICALLY_INDEXED_ARRAY_MULTI_INDEX_HPP + +#include "common_header.hpp" + +namespace ck { + +template +using MultiIndex = StaticallyIndexedArray; + +template +__host__ __device__ constexpr auto make_multi_index(Xs&&... xs) +{ + return make_statically_indexed_array(index_t{xs}...); +} + +template +__host__ __device__ constexpr auto make_zero_multi_index() +{ + return unpack([](auto... xs) { return make_multi_index(xs...); }, + typename uniform_sequence_gen::type{}); +} + +template +__host__ __device__ constexpr auto to_multi_index(const T& x) +{ + return unpack([](auto... ys) { return make_multi_index(ys...); }, x); +} + +// Here should use MultiIndex, instead of Tuple, although the former +// is the alias of the latter. This is because compiler cannot infer the NSize if +// using MultiIndex +// TODO: how to fix this? +template < + typename... Ys, + typename X, + enable_if_t::value && !std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator+=(Tuple& y, const X& x) +{ + static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Ys); + static_for<0, NSize, 1>{}([&](auto i) { y(i) += x[i]; }); + return y; +} + +template < + typename... Ys, + typename X, + enable_if_t::value && !std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator-=(Tuple& y, const X& x) +{ + static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Ys); + static_for<0, NSize, 1>{}([&](auto i) { y(i) -= x[i]; }); + return y; +} + +template < + typename... Xs, + typename Y, + enable_if_t::value && !std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator+(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] + y[i]; }); + return r; +} + +template < + typename... Xs, + typename Y, + enable_if_t::value && !std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator-(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] - y[i]; }); + return r; +} + +template < + typename... Xs, + typename Y, + enable_if_t::value && !std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator*(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! 
size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = x[i] * y[i]; }); + return r; +} + +// MultiIndex = scalar * MultiIndex +template ::value || std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator*(Y a, const Tuple& x) +{ + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; }); + return r; +} + +// MultiIndex = MultiIndex * scalar +template ::value || std::is_floating_point::value, bool> = false> +__host__ __device__ constexpr auto operator*(const Tuple& x, Y a) +{ + return a * x; +} + +namespace mathext { + +template +__host__ __device__ constexpr auto exp(const Tuple& x) +{ + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = math::exp(x[i]); }); + return r; +} + +template +__host__ __device__ constexpr auto max(const Tuple& x, const Y& y) +{ + static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same"); + constexpr index_t NSize = sizeof...(Xs); + + Tuple r; + static_for<0, NSize, 1>{}([&](auto i) { r(i) = math::max(x[i], y[i]); }); + return r; +} + +} // namespace mathext + +template +__host__ __device__ void print_multi_index(const Tuple& x) +{ + printf("{"); + printf("MultiIndex, "); + printf("size %d,", index_t{sizeof...(Xs)}); + static_for<0, sizeof...(Xs), 1>{}( + [&](auto i) { printf("%d ", static_cast(x.At(i))); }); + printf("}"); +} + +} // namespace ck +#endif diff --git a/include/ck/utility/synchronization.hpp b/include/ck/utility/synchronization.hpp new file mode 100644 index 00000000..9a463e56 --- /dev/null +++ b/include/ck/utility/synchronization.hpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" + +namespace ck { + +__device__ void block_sync_lds() +{ +#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM + asm volatile("\ + s_waitcnt lgkmcnt(0) \n \ + s_barrier \ + " ::); +#else + __syncthreads(); +#endif +} +__device__ void s_nop() +{ +#if 1 + asm volatile("\ + s_nop 0 \n \ + " ::); +#else + __builtin_amdgcn_sched_barrier(0); +#endif +} + +} // namespace ck diff --git a/include/ck/utility/thread_group.hpp b/include/ck/utility/thread_group.hpp new file mode 100644 index 00000000..d469dec8 --- /dev/null +++ b/include/ck/utility/thread_group.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "get_id.hpp" + +namespace ck { + +template +struct ThisThreadBlock +{ + static constexpr index_t kNumThread_ = ThreadPerBlock; + + __device__ static constexpr index_t GetNumOfThread() { return kNumThread_; } + + __device__ static constexpr bool IsBelong() { return true; } + + __device__ static index_t GetThreadId() { return get_thread_local_1d_id(); } +}; + +} // namespace ck diff --git a/include/ck/utility/transpose_vectors.hpp b/include/ck/utility/transpose_vectors.hpp new file mode 100644 index 00000000..2b0075d6 --- /dev/null +++ b/include/ck/utility/transpose_vectors.hpp @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
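transpose_vectors.hpp below builds its 2x2 fp16 and 4x4 int8 transposes on the v_perm_b32 byte select (__builtin_amdgcn_perm): each selector byte picks one byte out of the 8-byte pair formed by the second operand (bytes 0-3) and the first operand (bytes 4-7). A host-only emulation of that selection, checked against the worked example in the header's comments; perm_b32 is a hypothetical stand-in, on the GPU the builtin is used directly:

#include <cassert>
#include <cstdint>

// Emulates the v_perm_b32 byte select: selector byte i names which byte of
// {src1 = bytes 0..3, src0 = bytes 4..7} becomes byte i of the result.
constexpr std::uint32_t perm_b32(std::uint32_t src0, std::uint32_t src1, std::uint32_t sel)
{
    std::uint64_t combined = (static_cast<std::uint64_t>(src0) << 32) | src1;
    std::uint32_t result   = 0;
    for(int i = 0; i < 4; ++i)
    {
        std::uint32_t byte_index = (sel >> (8 * i)) & 0xff;
        std::uint32_t byte_value =
            static_cast<std::uint32_t>((combined >> (8 * byte_index)) & 0xff);
        result |= byte_value << (8 * i);
    }
    return result;
}

int main()
{
    // The worked example from the comments in transpose_vectors.hpp.
    assert(perm_b32(0x11223344u, 0x55667788u, 0x05010400u) == 0x33774488u);

    // The two selectors of transpose_fp16_2x2: rows (a,b) / (c,d), packed as
    // 16-bit lanes, become columns (a,c) / (b,d).
    std::uint32_t x0 = 0xBBBBAAAAu; // halves a = 0xAAAA (lane 0), b = 0xBBBB (lane 1)
    std::uint32_t x1 = 0xDDDDCCCCu; // halves c = 0xCCCC (lane 0), d = 0xDDDD (lane 1)
    assert(perm_b32(x1, x0, 0x05040100u) == 0xCCCCAAAAu); // y0 = (a, c)
    assert(perm_b32(x1, x0, 0x07060302u) == 0xDDDDBBBBu); // y1 = (b, d)
    return 0;
}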
+ +#pragma once + +#include "ck/ck.hpp" +#include "statically_indexed_array.hpp" +#include "data_type.hpp" + +namespace ck { + +template ::value, bool>::type = false> +struct transpose_vectors; + +// transpose fp16 2x2 +__device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1) +{ +#if 0 + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + + const vector_type vx0{x0}, vx1{x1}; + vector_type vy0, vy1; + + vy0.template AsType()(I0) = vx0.template AsType()[I0]; + vy0.template AsType()(I1) = vx1.template AsType()[I0]; + + vy1.template AsType()(I0) = vx0.template AsType()[I1]; + vy1.template AsType()(I1) = vx1.template AsType()[I1]; + + y0 = vy0.template AsType()[I0]; + y1 = vy1.template AsType()[I0]; +#else + constexpr int32_t m0 = 0x05040100; + constexpr int32_t m1 = 0x07060302; + + // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488 + // -- -- -- -- -- -- -- -- - - - - + // index 7 6 5 4 3 2 1 0 33 77 44 88 + // index is reversed because of little endianness (least significant bits first) + y0 = bit_cast(__builtin_amdgcn_perm(bit_cast(x1), bit_cast(x0), m0)); + y1 = bit_cast(__builtin_amdgcn_perm(bit_cast(x1), bit_cast(x0), m1)); +#endif +} + +template +struct transpose_vectors +{ + // we got [NY * NX] amount of S data to be transposed + static constexpr index_t s_per_x = NY; + static constexpr index_t s_per_y = NX; + + using S = half_t; + using VX = vector_type; + using VY = vector_type; + + __device__ void operator()(const StaticallyIndexedArray& vx_tuple, + StaticallyIndexedArray& vy_tuple) + { + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static_assert((NX % 2 == 0 && NY % 2 == 0), "wrong!"); + + // loop over 2x2 tile and transpose data from vx_tuple into vy_tuple + static_for<0, NY, 2>{}([&](auto iy) { + static_for<0, NX, 2>{}([&](auto ix) { + // reference to 2 half2_t data from vx_tuple + const auto& x_s2_0 = vx_tuple[ix].template AsType()[iy / I2]; + const auto& x_s2_1 = vx_tuple[ix + I1].template AsType()[iy / I2]; + + // reference to 2 half2_t data from vy_tuple + auto& y_s2_0 = vy_tuple(iy).template AsType()(ix / I2); + auto& y_s2_1 = vy_tuple(iy + I1).template AsType()(ix / I2); + + // transpose + transpose_fp16_2x2(x_s2_0, x_s2_1, y_s2_0, y_s2_1); + }); + }); + } +}; + +// transpose int8 4x4 +__device__ void transpose_int8_4x4(const int8x4_t& x0, + const int8x4_t& x1, + const int8x4_t& x2, + const int8x4_t& x3, + int8x4_t& y0, + int8x4_t& y1, + int8x4_t& y2, + int8x4_t& y3) +{ + int32_t t0, t1; + int32_t z0, z1, z2, z3; + constexpr int32_t m0 = 0x05010400; + constexpr int32_t m1 = 0x05040100; + constexpr int32_t m2 = 0x07060302; + constexpr int32_t m3 = 0x07030602; + + // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488 + // -- -- -- -- -- -- -- -- - - - - + // index 7 6 5 4 3 2 1 0 33 77 44 88 + // index is reversed because of little endianness (least significant bits first) + t0 = __builtin_amdgcn_perm(bit_cast(x1), bit_cast(x0), m0); + t1 = __builtin_amdgcn_perm(bit_cast(x3), bit_cast(x2), m0); + z0 = __builtin_amdgcn_perm(bit_cast(t1), bit_cast(t0), m1); + z1 = __builtin_amdgcn_perm(bit_cast(t1), bit_cast(t0), m2); + t0 = __builtin_amdgcn_perm(bit_cast(x1), bit_cast(x0), m3); + t1 = __builtin_amdgcn_perm(bit_cast(x3), bit_cast(x2), m3); + z2 = __builtin_amdgcn_perm(bit_cast(t1), bit_cast(t0), m1); + z3 = __builtin_amdgcn_perm(bit_cast(t1), bit_cast(t0), m2); + + y0 = bit_cast(z0); + y1 = bit_cast(z1); + y2 = 
bit_cast(z2); + y3 = bit_cast(z3); +} + +template +struct transpose_vectors +{ + // we got [NY * NX] amount of S data to be transposed + static constexpr index_t s_per_x = NY; + static constexpr index_t s_per_y = NX; + + using S = int8_t; + using VX = vector_type; + using VY = vector_type; + + __device__ void operator()(const StaticallyIndexedArray& vx_tuple, + StaticallyIndexedArray& vy_tuple) + { + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + static constexpr auto I3 = Number<3>{}; + static constexpr auto I4 = Number<4>{}; + + static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!"); + + // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple + static_for<0, NY, 4>{}([&](auto iy) { + static_for<0, NX, 4>{}([&](auto ix) { + // reference to 4 int8 data from vx_tuple + const auto& x_s4_0 = vx_tuple[ix].template AsType()[iy / I4]; + const auto& x_s4_1 = vx_tuple[ix + I1].template AsType()[iy / I4]; + const auto& x_s4_2 = vx_tuple[ix + I2].template AsType()[iy / I4]; + const auto& x_s4_3 = vx_tuple[ix + I3].template AsType()[iy / I4]; + + // reference to 4 int8 data from vy_tuple + auto& y_s4_0 = vy_tuple(iy).template AsType()(ix / I4); + auto& y_s4_1 = vy_tuple(iy + I1).template AsType()(ix / I4); + auto& y_s4_2 = vy_tuple(iy + I2).template AsType()(ix / I4); + auto& y_s4_3 = vy_tuple(iy + I3).template AsType()(ix / I4); + + // transpose + transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3); + }); + }); + } +}; + +} // namespace ck diff --git a/include/ck/utility/tuple.hpp b/include/ck/utility/tuple.hpp new file mode 100644 index 00000000..d8664be5 --- /dev/null +++ b/include/ck/utility/tuple.hpp @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/integral_constant.hpp" +#include "ck/utility/sequence.hpp" +#include "ck/utility/type.hpp" +#include "ck/utility/enable_if.hpp" + +namespace ck { + +namespace detail { + +template +struct TupleElementKey +{ + __host__ __device__ constexpr TupleElementKey() = default; +}; + +template +struct TupleElementKeyData +{ + using DataType = Data; + +#if 0 // workaround compiler complaint about implicitly-deleted default constructor + __host__ __device__ constexpr TupleElementKeyData() = default; +#else + __host__ __device__ constexpr TupleElementKeyData() : mData{} {} +#endif + + template , TupleElementKeyData>::value, + bool>::type = false> + __host__ __device__ constexpr TupleElementKeyData(T&& v) : mData(std::forward(v)) + { + } + + DataType mData; +}; + +// for read access of tuple element +template +__host__ __device__ constexpr const Data& +get_tuple_element_data_reference(const TupleElementKeyData& x) +{ + return static_cast(x.mData); +} + +// for write access of tuple element +template +__host__ __device__ constexpr Data& +get_tuple_element_data_reference(TupleElementKeyData& x) +{ + return x.mData; +} + +// TODO: not sure the use of reference is correct +template +__host__ __device__ constexpr Data&& +get_tuple_element_data_reference(TupleElementKeyData&& x) +{ + return static_cast(x.mData); +} + +// for infering type of tuple element +template +__host__ __device__ constexpr Data get_tuple_element_data(const TupleElementKeyData& x) +{ + return std::forward(x.mData); +} + +template +struct TupleImpl; + +template +struct TupleImpl, Xs...> : TupleElementKeyData, Xs>... 
+{ + __host__ __device__ constexpr TupleImpl() = default; + + template , TupleImpl>::value, + bool>::type = false> + __host__ __device__ constexpr TupleImpl(Y&& y) + : TupleElementKeyData, Xs>(std::forward(y))... + { + } + + template = 2, bool>::type = false> + __host__ __device__ constexpr TupleImpl(Ys&&... ys) + : TupleElementKeyData, Xs>(std::forward(ys))... + { + static_assert(sizeof...(Is) == sizeof...(Xs) && sizeof...(Is) == sizeof...(Ys), + "wrong! inconsistent size"); + } + + __host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); } + + template + __host__ __device__ constexpr const auto& GetElementDataByKey(TupleElementKey) const + { + return get_tuple_element_data_reference>(*this); + } + + template + __host__ __device__ constexpr auto& GetElementDataByKey(TupleElementKey) + { + return get_tuple_element_data_reference>(*this); + } +}; + +} // namespace detail + +template +struct Tuple : detail::TupleImpl::type, Xs...> +{ + using base = + detail::TupleImpl::type, Xs...>; + + __host__ __device__ constexpr Tuple() = default; + + template , Tuple>::value, + bool>::type = false> + __host__ __device__ constexpr Tuple(Y&& y) : base(std::forward(y)) + { + } + + template = 2, bool>::type = + false> + __host__ __device__ constexpr Tuple(Ys&&... ys) : base(std::forward(ys)...) + { + } + + __host__ __device__ static constexpr index_t Size() { return sizeof...(Xs); } + + // read access + template + __host__ __device__ constexpr const auto& At(Number) const + { + static_assert(I < base::Size(), "wrong! out of range"); + return base::GetElementDataByKey(detail::TupleElementKey{}); + } + + // write access + template + __host__ __device__ constexpr auto& At(Number) + { + static_assert(I < base::Size(), "wrong! out of range"); + return base::GetElementDataByKey(detail::TupleElementKey{}); + } + + // read access + template + __host__ __device__ constexpr const auto& operator[](Number i) const + { + return At(i); + } + + // write access + template + __host__ __device__ constexpr auto& operator()(Number i) + { + return At(i); + } + + template + __host__ __device__ constexpr auto operator=(const T& a) + { + static_assert(T::Size() == Size(), "wrong! size not the same"); + + static_for<0, Size(), 1>{}([&](auto i) { operator()(i) = a[i]; }); + + return *this; + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } +}; + +template <> +struct Tuple<> +{ + __host__ __device__ constexpr Tuple() = default; + + __host__ __device__ static constexpr index_t Size() { return 0; } + + template + __host__ __device__ constexpr auto operator=(const T&) + { + return *this; + } + + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } +}; + +template +struct tuple_element +{ + // type should keep the cv/ref qualifier of original tuple element + using type = decltype(detail::get_tuple_element_data>(TTuple{})); +}; + +template +using tuple_element_t = typename tuple_element::type; + +template +__host__ __device__ constexpr auto make_tuple(Xs&&... xs) +{ + return Tuple...>(std::forward(xs)...); +} + +// https://en.cppreference.com/w/cpp/utility/tuple/tie +template +constexpr Tuple tie(Args&... args) noexcept +{ + return {args...}; +} + +} // namespace ck diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp new file mode 100644 index 00000000..6f5b142a --- /dev/null +++ b/include/ck/utility/tuple_helper.hpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include "functional4.hpp" +#include "tuple.hpp" + +namespace ck { + +template +__host__ __device__ constexpr auto generate_tuple(F&& f, Number) +{ + return unpack([&f](auto&&... xs) { return make_tuple(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + +template +__host__ __device__ constexpr auto generate_tie(F&& f, Number) +{ + return unpack([&f](auto&&... xs) { return tie(f(xs)...); }, + typename arithmetic_sequence_gen<0, N, 1>::type{}); +} + +// tx and ty are tuple of references, return type of will tuple of referennce (not rvalue) +template +__host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple& tx, + const Tuple& ty) +{ + return unpack2( + [&](auto&&... zs) { return Tuple{std::forward(zs)...}; }, + tx, + ty); +} + +namespace detail { + +template +__host__ __device__ constexpr auto transform_tuples_impl(F f, const X& x, Sequence) +{ + return make_tuple(f(x.At(Number{}))...); +} + +template +__host__ __device__ constexpr auto +transform_tuples_impl(F f, const X& x, const Y& y, Sequence) +{ + return make_tuple(f(x.At(Number{}), y.At(Number{}))...); +} + +template +__host__ __device__ constexpr auto +transform_tuples_impl(F f, const X& x, const Y& y, const Z& z, Sequence) +{ + return make_tuple(f(x.At(Number{}), y.At(Number{}), z.At(Number{}))...); +} + +} // namespace detail + +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x) +{ + return detail::transform_tuples_impl( + f, x, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y) +{ + return detail::transform_tuples_impl( + f, x, y, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + +template +__host__ __device__ constexpr auto transform_tuples(F f, const X& x, const Y& y, const Z& z) +{ + return detail::transform_tuples_impl( + f, x, y, z, typename arithmetic_sequence_gen<0, X::Size(), 1>::type{}); +} + +} // namespace ck diff --git a/include/ck/utility/type.hpp b/include/ck/utility/type.hpp new file mode 100644 index 00000000..90b9df29 --- /dev/null +++ b/include/ck/utility/type.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
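+// Minimal usage sketch for Tuple and the helpers in tuple_helper.hpp above
+// (illustrative only; the function name, lambdas and values are hypothetical
+// and the block is guarded out of compilation):
+#if 0
+void tuple_usage_sketch()
+{
+    // heterogeneous tuple; At/operator[] read, operator() writes
+    auto t = ck::make_tuple(1, 2.0f, ck::Number<3>{});
+    auto x = t[ck::Number<0>{}];
+    t(ck::Number<0>{}) = 7;
+
+    // generate_tuple invokes the functor with Number<0> .. Number<N-1>
+    auto squares = ck::generate_tuple([](auto i) { return i.value * i.value; }, ck::Number<4>{});
+
+    // transform_tuples applies the functor element-wise over the tuple(s)
+    auto doubled = ck::transform_tuples([](auto v) { return v + v; }, squares);
+}
+#endif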
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/integral_constant.hpp" +#include "ck/utility/enable_if.hpp" + +namespace ck { + +template +struct is_same : public integral_constant +{ +}; + +template +struct is_same : public integral_constant +{ +}; + +template +inline constexpr bool is_same_v = is_same::value; + +template +using remove_reference_t = typename std::remove_reference::type; + +template +using remove_cv_t = typename std::remove_cv::type; + +template +using remove_cvref_t = remove_cv_t>; + +template +using remove_pointer_t = typename std::remove_pointer::type; + +template +inline constexpr bool is_pointer_v = std::is_pointer::value; + +template ::type = false> +__host__ __device__ constexpr Y bit_cast(const X& x) +{ +#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST + Y y; + + __builtin_memcpy(&y, &x, sizeof(X)); + + return y; +#else + union AsType + { + X x; + Y y; + }; + + return AsType{x}.y; +#endif +} + +} // namespace ck diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt new file mode 100644 index 00000000..90873fdd --- /dev/null +++ b/library/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(src/tensor_operation_instance/gpu) +add_subdirectory(src/utility) diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp new file mode 100644 index 00000000..46a1fa55 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchedGemm : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_g_m_k, + const Tensor& b_g_k_n, + Tensor& c_g_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_g_m_k_{a_g_m_k}, + b_g_k_n_{b_g_k_n}, + c_g_m_n_{c_g_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_g_m_k_; + const Tensor& b_g_k_n_; + Tensor& c_g_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceBatchedGemm::Argument; + + float Run(const Argument& arg) + { + auto f_gmk_gkn_gmn = [&](auto g, auto m, auto n) { + const int K = arg.a_g_m_k_.mDesc.GetLengths()[2]; + + AccDataType v_acc = 0; + + for(int k = 0; k < K; ++k) + { + ADataType v_a; + BDataType v_b; + + arg.a_element_op_(v_a, arg.a_g_m_k_(g, m, k)); + arg.b_element_op_(v_b, arg.b_g_k_n_(g, k, n)); + + v_acc += + ck::type_convert(v_a) * ck::type_convert(v_b); + } + + AccDataType v_c; + + arg.c_element_op_(v_c, v_acc); + + arg.c_g_m_n_(g, m, n) = ck::type_convert(v_c); + }; + + make_ParallelTensorFunctor(f_gmk_gkn_gmn, + arg.c_g_m_n_.mDesc.GetLengths()[0], + arg.c_g_m_n_.mDesc.GetLengths()[1], + arg.c_g_m_n_.mDesc.GetLengths()[2])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + 
{ + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_g_m_k, + const Tensor& b_g_k_n, + Tensor& c_g_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{a_g_m_k, b_g_k_n, c_g_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceBatchedGemm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp new file mode 100644 index 00000000..0b621e88 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/utility/math_v2.hpp" +#include "ck/utility/ignore.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormBwd : public device::DeviceBatchNormBwd +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array dxStrides, + const std::array dyStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnDscaleDbiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const DyDataType* p_dy, + const ScaleDataType* p_scale, + const MeanVarDataType* p_savedMean, + const MeanVarDataType* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + DxDataType* p_dx, + DscaleDbiasDataType* p_dscale, + DscaleDbiasDataType* p_dbias) + : reduceDims_(reduceDims), + bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnDscaleDbiasStrides_(bnDscaleDbiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + p_dy_(p_dy), + p_scale_(p_scale), + p_savedMean_(p_savedMean), + p_savedInvVar_(p_savedInvVar), + dy_elementwise_op_(dy_elementwise_op), + p_dx_(p_dx), + p_dscale_(p_dscale), + p_dbias_(p_dbias) + { + using ck::host_common::get_index_set; + + if(std::any_of( + reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + // get invariant_dims[] and invariant_lengths[] + for(int dim = 0, i = 0; dim < Rank; dim++) + if(std::none_of( + reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; })) + { + invariantDims_[i] = dim; + 
invariant_lengths_[i] = xyLengths[dim]; + i++; + }; + + // get reduce_lengths_[] + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims[j]; + reduce_lengths_[i++] = xyLengths[dim]; + }; + + for(int i = 0; i < NumInvariantDim; i++) + if(invariant_lengths_[i] != bnScaleBiasMeanVarLengths_[i]) + throw std::runtime_error("Invalid lengths parameters!"); + + for(int j = 0, i = 0; j < NumInvariantDim; j++) + { + int dim = invariantDims_[j]; + x_invariant_strides_[i] = xStrides[dim]; + dy_invariant_strides_[i] = dyStrides[dim]; + dx_invariant_strides_[i] = dxStrides[dim]; + i++; + }; + + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims_[j]; + x_reduce_strides_[i] = xStrides[dim]; + dy_reduce_strides_[i] = dyStrides[dim]; + dx_reduce_strides_[i] = dxStrides[dim]; + i++; + }; + + reduceSize_ = std::accumulate( + reduce_lengths_.begin(), reduce_lengths_.end(), 1, std::multiplies{}); + + invariant_index_set_ = get_index_set(invariant_lengths_); + reduce_index_set_ = get_index_set(reduce_lengths_); + + epsilon_ = type_convert(epsilon); + + haveSavedMeanInvVar_ = (p_savedMean != nullptr && p_savedInvVar != nullptr); + } + + std::array reduceDims_; + std::array invariantDims_; + std::array invariant_lengths_; + std::array reduce_lengths_; + + const std::array bnScaleBiasMeanVarLengths_; + const std::array bnScaleStrides_; + const std::array bnDscaleDbiasStrides_; + const std::array bnMeanVarStrides_; + + std::array x_invariant_strides_; + std::array dy_invariant_strides_; + std::array dx_invariant_strides_; + std::array x_reduce_strides_; + std::array dy_reduce_strides_; + std::array dx_reduce_strides_; + + const XDataType* p_x_; + const DyDataType* p_dy_; + const ScaleDataType* p_scale_; + const MeanVarDataType* p_savedMean_; + const MeanVarDataType* p_savedInvVar_; + const DyElementwiseOp dy_elementwise_op_; + + DxDataType* p_dx_; + DscaleDbiasDataType* p_dscale_; + DscaleDbiasDataType* p_dbias_; + + bool haveSavedMeanInvVar_; + + std::vector> invariant_index_set_; + std::vector> reduce_index_set_; + + AccDataType epsilon_; + size_t reduceSize_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + using ck::host_common::get_offset_from_index; + + auto thread_reduce_func = [&](auto invariant_index) { + size_t x_invariant_offset = get_offset_from_index( + arg.x_invariant_strides_, invariant_index); + size_t dy_invariant_offset = get_offset_from_index( + arg.dy_invariant_strides_, invariant_index); + size_t dx_invariant_offset = get_offset_from_index( + arg.dx_invariant_strides_, invariant_index); + + AccDataType mean = type_convert(0.0f); + AccDataType variance = type_convert(0.0f); + AccDataType invVar; + int32_t curr_count = 0; + + if(arg.haveSavedMeanInvVar_) + { + size_t mean_invVar_invariant_offset = get_offset_from_index( + arg.bnMeanVarStrides_, invariant_index); + + mean = + type_convert(arg.p_savedMean_[mean_invVar_invariant_offset]); + invVar = + type_convert(arg.p_savedInvVar_[mean_invVar_invariant_offset]); + } + else + { + // compute mean, variance using welford method + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + + curr_count++; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType delta = x - mean; + + mean += delta / curr_count; + + AccDataType delta2 = x - mean; + + variance += delta * delta2; + }; + + // 
actual variance + variance = variance / curr_count; + + // inv-variance defined as 1/sqrt(epsilon+variance) + invVar = + type_convert(1.0f) / ck::math::sqrt(arg.epsilon_ + variance); + }; + + AccDataType dbias = + type_convert(0.0f); // Sum on reduced dimensions of dy + AccDataType dscale = + type_convert(0.0f); // Sum on reduced dimensions of dy * norm_x + + // 1) calculate dy * (x - mean) * inv-variance + // 2) calculate sum(dy) on reduced dimensions + // 3) calculate sum(dy * norm_x) on reduced dimensions + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + size_t dy_reduce_offset = get_offset_from_index( + arg.dy_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + auto dy_offset = dy_invariant_offset + dy_reduce_offset; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType norm_x = (x - mean) * invVar; + AccDataType dy = type_convert(arg.p_dy_[dy_offset]); + + arg.dy_elementwise_op_(dy, dy); + + dbias += dy; + dscale += norm_x * dy; + }; + + size_t dscale_offset = get_offset_from_index( + arg.bnDscaleDbiasStrides_, invariant_index); + size_t dbias_offset = get_offset_from_index( + arg.bnDscaleDbiasStrides_, invariant_index); + + arg.p_dscale_[dscale_offset] = type_convert(dscale); + arg.p_dbias_[dbias_offset] = type_convert(dbias); + + size_t scale_offset = + get_offset_from_index(arg.bnScaleStrides_, invariant_index); + + AccDataType scale = type_convert(arg.p_scale_[scale_offset]); + + AccDataType multiplier = type_convert(1.0f) / + type_convert(arg.reduceSize_) * invVar * + scale; + + // 1) calculate tmp = dscale * (x - mean) * inv-variance + // 2) calculate dx = 1/reduceSize * inv-variance * scale * (reduceSize * dy - dbias + // - tmp) + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + size_t dy_reduce_offset = get_offset_from_index( + arg.dy_reduce_strides_, reduce_index); + size_t dx_reduce_offset = get_offset_from_index( + arg.dx_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + auto dy_offset = dy_invariant_offset + dy_reduce_offset; + auto dx_offset = dx_invariant_offset + dx_reduce_offset; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType norm_x = (x - mean) * invVar; + AccDataType dy = type_convert(arg.p_dy_[dy_offset]); + + arg.dy_elementwise_op_(dy, dy); + + AccDataType tmpVal = norm_x * dscale; + + AccDataType dx = multiplier * (type_convert(arg.reduceSize_) * dy - + dbias - tmpVal); + + arg.p_dx_[dx_offset] = type_convert(dx); + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = std::min(static_cast((it + 1) * work_per_thread), + arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; ++i) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) 
override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array dxStrides, + const std::array dyStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnDscaleDbiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* p_dy, + const void* p_scale, + const void* p_savedMean, + const void* p_savedInvVar, + double epsilon, + const DyElementwiseOp dy_elementwise_op, + void* p_dx, + void* p_dscale, + void* p_dbias) override + { + return std::make_unique(xyLengths, + xStrides, + dxStrides, + dyStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnDscaleDbiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(p_dy), + static_cast(p_scale), + static_cast(p_savedMean), + static_cast(p_savedInvVar), + epsilon, + dy_elementwise_op, + static_cast(p_dx), + static_cast(p_dscale), + static_cast(p_dbias)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Backward" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp new file mode 100644 index 00000000..dd0db316 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
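+// Summary of the batchnorm backward formulas implemented by the reference
+// above, restated for clarity (N denotes the number of reduced elements):
+//
+//   invVar = 1 / sqrt(variance + epsilon)
+//   norm_x = (x - mean) * invVar
+//   dbias  = sum_over_reduced_dims(dy)
+//   dscale = sum_over_reduced_dims(dy * norm_x)
+//   dx     = (1 / N) * invVar * scale * (N * dy - dbias - norm_x * dscale)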
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/utility/math_v2.hpp" +#include "ck/utility/ignore.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormFwd : public device::DeviceBatchNormFwd +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const ScaleDataType* bnScale, + const BiasDataType* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + YDataType* p_y, + MeanVarDataType* resultSaveMean, + MeanVarDataType* resultSaveInvVariance, + double averageFactor, + MeanVarDataType* resultRunningMean, + MeanVarDataType* resultRunningVariance) + : reduceDims_(reduceDims), + bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnBiasStrides_(bnBiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + bnScale_(bnScale), + bnBias_(bnBias), + y_elementwise_op_(y_elementwise_op), + p_y_(p_y), + resultSaveMean_(resultSaveMean), + resultSaveInvVariance_(resultSaveInvVariance), + resultRunningMean_(resultRunningMean), + resultRunningVariance_(resultRunningVariance) + { + using ck::host_common::get_index_set; + + if(std::any_of( + reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + // get invariant_dims[] and invariant_lengths[] + for(int dim = 0, i = 0; dim < Rank; dim++) + if(std::none_of( + reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; })) + { + invariantDims_[i] = dim; + invariant_lengths_[i] = xyLengths[dim]; + i++; + }; + + // get reduce_lengths_[] + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims[j]; + reduce_lengths_[i++] = xyLengths[dim]; + }; + + for(int i = 0; i < NumInvariantDim; i++) + if(invariant_lengths_[i] != bnScaleBiasMeanVarLengths_[i]) + throw std::runtime_error("Invalid lengths parameters!"); + + for(int j = 0, i = 0; j < NumInvariantDim; j++) + { + int dim = invariantDims_[j]; + x_invariant_strides_[i] = xStrides[dim]; + y_invariant_strides_[i] = yStrides[dim]; + i++; + }; + + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims_[j]; + x_reduce_strides_[i] = xStrides[dim]; + y_reduce_strides_[i] = yStrides[dim]; + i++; + }; + + invariant_index_set_ = get_index_set(invariant_lengths_); + reduce_index_set_ = get_index_set(reduce_lengths_); + + epsilon_ = type_convert(epsilon); + averageFactor_ = type_convert(averageFactor); + + resultSave = (resultSaveMean != nullptr && resultSaveInvVariance != nullptr); + resultRunning = (resultRunningMean != nullptr && resultRunningVariance != nullptr); + } + + std::array reduceDims_; + std::array invariantDims_; + std::array invariant_lengths_; + std::array reduce_lengths_; + + const std::array bnScaleBiasMeanVarLengths_; + const std::array bnScaleStrides_; + const std::array bnBiasStrides_; + const std::array bnMeanVarStrides_; + + std::array x_invariant_strides_; 
+ std::array y_invariant_strides_; + std::array x_reduce_strides_; + std::array y_reduce_strides_; + + const XDataType* p_x_; + const ScaleDataType* bnScale_; + const BiasDataType* bnBias_; + const YElementwiseOp y_elementwise_op_; + YDataType* p_y_; + + MeanVarDataType* resultSaveMean_; + MeanVarDataType* resultSaveInvVariance_; + MeanVarDataType* resultRunningMean_; + MeanVarDataType* resultRunningVariance_; + + bool resultSave, resultRunning; + + std::vector> invariant_index_set_; + std::vector> reduce_index_set_; + + AccDataType averageFactor_; + AccDataType epsilon_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + using ck::host_common::get_offset_from_index; + + auto thread_reduce_func = [&](auto invariant_index) { + size_t x_invariant_offset = get_offset_from_index( + arg.x_invariant_strides_, invariant_index); + size_t y_invariant_offset = get_offset_from_index( + arg.y_invariant_strides_, invariant_index); + AccDataType mean = type_convert(0.0f); + AccDataType variance = type_convert(0.0f); + int32_t curr_count = 0; + + // compute mean, variance using welford method + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + + curr_count++; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType delta = x - mean; + + mean += delta / curr_count; + + AccDataType delta2 = x - mean; + + variance += delta * delta2; + }; + + // actual variance + variance = variance / curr_count; + + // inv-variance defined as 1/sqrt(epsilon+variance) + AccDataType invVariance = + type_convert(1.0f) / ck::math::sqrt(arg.epsilon_ + variance); + + // save the mean/inv-variance if required + if(arg.resultSave) + { + size_t offset = get_offset_from_index(arg.bnMeanVarStrides_, + invariant_index); + + arg.resultSaveMean_[offset] = type_convert(mean); + arg.resultSaveInvVariance_[offset] = type_convert(invVariance); + }; + + // update the moving average if required + if(arg.resultRunning) + { + size_t offset = get_offset_from_index(arg.bnMeanVarStrides_, + invariant_index); + + AccDataType oneMinusAverageFactor = + type_convert(1.0) - arg.averageFactor_; + arg.resultRunningMean_[offset] = type_convert( + type_convert(arg.resultRunningMean_[offset]) * + oneMinusAverageFactor + + mean * arg.averageFactor_); + arg.resultRunningVariance_[offset] = type_convert( + arg.resultRunningVariance_[offset] * oneMinusAverageFactor + + variance * arg.averageFactor_); + }; + + size_t scale_offset = + get_offset_from_index(arg.bnScaleStrides_, invariant_index); + size_t bias_offset = + get_offset_from_index(arg.bnBiasStrides_, invariant_index); + + AccDataType scale = type_convert(arg.bnScale_[scale_offset]); + AccDataType bias = type_convert(arg.bnBias_[bias_offset]); + + // Normalization + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + size_t y_reduce_offset = get_offset_from_index( + arg.y_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + auto y_offset = y_invariant_offset + y_reduce_offset; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType norm_x = (x - mean) * invVariance; + + AccDataType y = scale * norm_x + bias; + + arg.y_elementwise_op_(y, y); + + arg.p_y_[y_offset] = type_convert(y); + }; + }; + + std::size_t num_thread = 
std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = std::min(static_cast((it + 1) * work_per_thread), + arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; ++i) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + void* p_y, + void* resultSaveMean, + void* resultSaveInvVariance, + double averageFactor, + void* resultRunningMean, + void* resultRunningVariance) override + { + return std::make_unique(xyLengths, + xStrides, + yStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(bnScale), + static_cast(bnBias), + epsilon, + y_elementwise_op, + static_cast(p_y), + static_cast(resultSaveMean), + static_cast(resultSaveInvVariance), + averageFactor, + static_cast(resultRunningMean), + static_cast(resultRunningVariance)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Forward" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp new file mode 100644 index 00000000..463c655a --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_batchnorm_infer.hpp @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
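+// Minimal sketch of the Welford update used by the reference batchnorm
+// forward/backward implementations above: the running mean and M2 (sum of
+// squared deltas) are accumulated in one pass, and variance = M2 / count at
+// the end. Illustrative only; the helper below is hypothetical and guarded
+// out of compilation.
+#if 0
+template <typename AccDataType>
+void welford_update(AccDataType x, AccDataType& mean, AccDataType& m2, int32_t& count)
+{
+    count += 1;
+    AccDataType delta = x - mean;
+    mean += delta / count;
+    AccDataType delta2 = x - mean;
+    m2 += delta * delta2;
+    // after the last update: variance = m2 / count (biased, as in the reference)
+}
+#endif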
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/library/utility/host_common_util.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_infer.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceBatchNormInfer : public device::DeviceBatchNormInfer +{ + static_assert(Rank <= 6, "Bigger Rank size is not supported!"); + + static constexpr index_t NumInvariantDim = Rank - NumBatchNormReduceDim; + + struct Argument : public device::BaseArgument + { + Argument(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const XDataType* p_x, + const ScaleDataType* bnScale, + const BiasDataType* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + const MeanVarDataType* estimatedMean, + const MeanVarDataType* estimatedVariance, + YDataType* p_y) + : reduceDims_(reduceDims), + bnScaleBiasMeanVarLengths_(bnScaleBiasMeanVarLengths), + bnScaleStrides_(bnScaleStrides), + bnBiasStrides_(bnBiasStrides), + bnMeanVarStrides_(bnMeanVarStrides), + p_x_(p_x), + bnScale_(bnScale), + bnBias_(bnBias), + y_elementwise_op_(y_elementwise_op), + estimatedMean_(estimatedMean), + estimatedVariance_(estimatedVariance), + p_y_(p_y) + { + using ck::host_common::get_index_set; + + if(std::any_of( + reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + // get invariant_dims[] and invariant_lengths[] + for(int dim = 0, i = 0; dim < Rank; dim++) + if(std::none_of( + reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; })) + { + invariantDims_[i] = dim; + invariant_lengths_[i] = xyLengths[dim]; + i++; + }; + + // get reduce_lengths_[] + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims[j]; + reduce_lengths_[i++] = xyLengths[dim]; + }; + + // check invariant_lengths_ and bnScaleBiasMeanVarLengths + for(int i = 0; i < NumInvariantDim; i++) + if(invariant_lengths_[i] != bnScaleBiasMeanVarLengths_[i]) + throw std::runtime_error("Invalid lengths parameters!"); + + for(int j = 0, i = 0; j < NumInvariantDim; j++) + { + int dim = invariantDims_[j]; + x_invariant_strides_[i] = xStrides[dim]; + y_invariant_strides_[i] = yStrides[dim]; + i++; + }; + + for(int j = 0, i = 0; j < NumBatchNormReduceDim; j++) + { + int dim = reduceDims_[j]; + x_reduce_strides_[i] = xStrides[dim]; + y_reduce_strides_[i] = yStrides[dim]; + i++; + }; + + invariant_index_set_ = get_index_set(invariant_lengths_); + reduce_index_set_ = get_index_set(reduce_lengths_); + + epsilon_ = type_convert(epsilon); + } + + std::array reduceDims_; + std::array invariantDims_; + std::array invariant_lengths_; + std::array reduce_lengths_; + + const std::array bnScaleBiasMeanVarLengths_; + const std::array bnScaleStrides_; + const std::array bnBiasStrides_; + const std::array bnMeanVarStrides_; + + std::array x_invariant_strides_; + std::array y_invariant_strides_; + std::array x_reduce_strides_; + std::array y_reduce_strides_; + + const XDataType* p_x_; + const ScaleDataType* bnScale_; + const BiasDataType* bnBias_; + const YElementwiseOp y_elementwise_op_; + + const MeanVarDataType* estimatedMean_; + const MeanVarDataType* estimatedVariance_; + + YDataType* p_y_; + + std::vector> invariant_index_set_; + std::vector> reduce_index_set_; + + 
AccDataType epsilon_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + using ck::host_common::get_offset_from_index; + + auto thread_reduce_func = [&](auto invariant_index) { + size_t x_invariant_offset = get_offset_from_index( + arg.x_invariant_strides_, invariant_index); + size_t y_invariant_offset = get_offset_from_index( + arg.y_invariant_strides_, invariant_index); + + size_t mean_variance_offset = + get_offset_from_index(arg.bnMeanVarStrides_, invariant_index); + + AccDataType mean = arg.estimatedMean_[mean_variance_offset]; + AccDataType variance = arg.estimatedVariance_[mean_variance_offset]; + + // inv-variance defined as 1/sqrt(epsilon+variance) + AccDataType invVariance = + type_convert(1.0f) / std::sqrt(arg.epsilon_ + variance); + + size_t scale_offset = + get_offset_from_index(arg.bnScaleStrides_, invariant_index); + size_t bias_offset = + get_offset_from_index(arg.bnBiasStrides_, invariant_index); + + AccDataType scale = type_convert(arg.bnScale_[scale_offset]); + AccDataType bias = type_convert(arg.bnBias_[bias_offset]); + + // normalization + for(const auto& reduce_index : arg.reduce_index_set_) + { + size_t x_reduce_offset = get_offset_from_index( + arg.x_reduce_strides_, reduce_index); + size_t y_reduce_offset = get_offset_from_index( + arg.y_reduce_strides_, reduce_index); + + auto x_offset = x_invariant_offset + x_reduce_offset; + auto y_offset = y_invariant_offset + y_reduce_offset; + + AccDataType x = type_convert(arg.p_x_[x_offset]); + + AccDataType norm_x = (x - mean) * invVariance; + + AccDataType y = scale * norm_x + bias; + + arg.y_elementwise_op_(y, y); + + arg.p_y_[y_offset] = type_convert(y); + }; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = std::min(static_cast((it + 1) * work_per_thread), + arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; ++i) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + (void)p_arg; + + return (true); + }; + + std::unique_ptr + MakeArgumentPointer(const std::array xyLengths, + const std::array xStrides, + const std::array yStrides, + const std::array reduceDims, + const std::array bnScaleBiasMeanVarLengths, + const std::array bnScaleStrides, + const std::array bnBiasStrides, + const std::array bnMeanVarStrides, + const void* p_x, + const void* bnScale, + const void* bnBias, + double epsilon, + const YElementwiseOp y_elementwise_op, + const void* estimatedMean, + const void* estimatedVariance, + void* p_y) override + { + return std::make_unique(xyLengths, + xStrides, + yStrides, + reduceDims, + bnScaleBiasMeanVarLengths, + bnScaleStrides, + bnBiasStrides, + bnMeanVarStrides, + static_cast(p_x), + static_cast(bnScale), + static_cast(bnBias), + epsilon, + y_elementwise_op, + static_cast(estimatedMean), + static_cast(estimatedVariance), + static_cast(p_y)); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + 
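+    // Inference applies the affine normalization with the provided running
+    // statistics (no reduction is performed here):
+    //
+    //   y = scale * (x - estimatedMean) / sqrt(estimatedVariance + epsilon) + bias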
+ std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_BatchNorm_Infer<" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp new file mode 100644 index 00000000..b0149d88 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_cgemm.hpp @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/library/utility/host_tensor.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// FIXME: support arbitrary elementwise operation for A/B/C +template < + typename ADataType, + typename BDataType, + typename CDataType, + typename AElementwiseOperation, + typename BElementwiseOperation, + typename CElementwiseOperation, + enable_if_t< + is_same_v && + is_same_v && + is_same_v, + bool> = false> +struct ReferenceCGemm : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k_real, + const Tensor& a_m_k_imag, + const Tensor& b_k_n_real, + const Tensor& b_k_n_imag, + Tensor& c_m_n_real, + Tensor& c_m_n_imag, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_m_k_real_{a_m_k_real}, + a_m_k_imag_{a_m_k_imag}, + b_k_n_real_{b_k_n_real}, + b_k_n_imag_{b_k_n_imag}, + c_m_n_real_{c_m_n_real}, + c_m_n_imag_{c_m_n_imag}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_m_k_real_; + const Tensor& a_m_k_imag_; + const Tensor& b_k_n_real_; + const Tensor& b_k_n_imag_; + Tensor& c_m_n_real_; + Tensor& c_m_n_imag_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceCGemm::Argument; + + float Run(const Argument& arg) + { + const std::size_t K = arg.a_m_k_real_.mDesc.GetLengths()[1]; + + if(K != arg.a_m_k_imag_.mDesc.GetLengths()[1]) + { + throw std::runtime_error("wrong! 
Incompatible real and imag sizes in CGEMM"); + } + + auto f_mk_kn_mn_real = [&](auto m, auto n) { + float v_c_real = 0; + + for(std::size_t k = 0; k < K; ++k) + { + float v_a_real = ck::type_convert(arg.a_m_k_real_(m, k)); + float v_a_imag = ck::type_convert(arg.a_m_k_imag_(m, k)); + float v_b_real = ck::type_convert(arg.b_k_n_real_(k, n)); + float v_b_imag = ck::type_convert(arg.b_k_n_imag_(k, n)); + + v_c_real += v_a_real * v_b_real - v_a_imag * v_b_imag; + } + + arg.c_m_n_real_(m, n) = ck::type_convert(v_c_real); + }; + + auto f_mk_kn_mn_imag = [&](auto m, auto n) { + float v_c_imag = 0; + + for(std::size_t k = 0; k < K; ++k) + { + float v_a_real = ck::type_convert(arg.a_m_k_real_(m, k)); + float v_a_imag = ck::type_convert(arg.a_m_k_imag_(m, k)); + float v_b_real = ck::type_convert(arg.b_k_n_real_(k, n)); + float v_b_imag = ck::type_convert(arg.b_k_n_imag_(k, n)); + + v_c_imag += v_a_real * v_b_imag + v_a_imag * v_b_real; + } + + arg.c_m_n_imag_(m, n) = ck::type_convert(v_c_imag); + }; + + make_ParallelTensorFunctor(f_mk_kn_mn_real, + arg.c_m_n_real_.mDesc.GetLengths()[0], + arg.c_m_n_real_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(f_mk_kn_mn_imag, + arg.c_m_n_imag_.mDesc.GetLengths()[0], + arg.c_m_n_imag_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k_real, + const Tensor& a_m_k_imag, + const Tensor& b_k_n_real, + const Tensor& b_k_n_imag, + Tensor& c_m_n_real, + Tensor& c_m_n_imag, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{a_m_k_real, + a_m_k_imag, + b_k_n_real, + b_k_n_imag, + c_m_n_real, + c_m_n_imag, + a_element_op, + b_element_op, + c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceCGemm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp new file mode 100644 index 00000000..225f7b7e --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
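+// The reference CGEMM above accumulates the real and imaginary outputs with
+// the standard complex product identity, evaluated in two separate passes:
+//
+//   c_real(m, n) = sum_k( a_real(m, k) * b_real(k, n) - a_imag(m, k) * b_imag(k, n) )
+//   c_imag(m, n) = sum_k( a_real(m, k) * b_imag(k, n) + a_imag(m, k) * b_real(k, n) )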
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// input descriptor in [G, N, C, Do, Ho, Wo] order +// weight descriptor in [G, K, C, Z, Y, X] order +// output descriptor in [G, N, K, Di, Hi, Wi] order +// phyiscal layout is irrelavent +template = 1 && NDimSpatial <= 3, bool>::type = false> +struct ReferenceConvBwdData : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(Tensor& input, + const Tensor& weight, + const Tensor& output, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : input_{input}, + weight_{weight}, + output_{output}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + Tensor& input_; + const Tensor& weight_; + const Tensor& output_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceConvBwdData::Argument; + + float Run(const Argument& arg) + { + if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && + arg.weight_.GetNumOfDimension() == NDimSpatial + 3 && + arg.output_.GetNumOfDimension() == NDimSpatial + 3)) + { + throw std::runtime_error("wrong! 
inconsistent dimension"); + } + + if constexpr(NDimSpatial == 1) + { + auto f_ncw = [&](auto g, auto n, auto c, auto wi) { + std::size_t K = arg.weight_.GetLengths()[1]; + std::size_t X = arg.weight_.GetLengths()[3]; + std::size_t Wo = arg.output_.GetLengths()[3]; + + float v_acc = 0; + + for(std::size_t x = 0; x < X; ++x) + { + auto w_tmp = static_cast(wi) + + static_cast(arg.in_left_pads_[0]) - + static_cast(x * arg.conv_dilations_[0]); + + if(w_tmp % arg.conv_strides_[0] == 0) + { + auto wo = static_cast(w_tmp) / + static_cast(arg.conv_strides_[0]); + + if(wo >= 0 && ck::type_convert(wo) < Wo) + { + for(std::size_t k = 0; k < K; ++k) + { + float v_out = 0; + float v_wei = 0; + + arg.out_element_op_( + v_out, ck::type_convert(arg.output_(g, n, k, wo))); + + arg.wei_element_op_( + v_wei, ck::type_convert(arg.weight_(g, k, c, x))); + + v_acc += v_out * v_wei; + } + } + } + } + + float v_in; + + arg.in_element_op_(v_in, v_acc); + + arg.input_(g, n, c, wi) = ck::type_convert(v_acc); + }; + + make_ParallelTensorFunctor(f_ncw, + arg.input_.GetLengths()[0], + arg.input_.GetLengths()[1], + arg.input_.GetLengths()[2], + arg.input_.GetLengths()[3])( + std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 2) + { + auto f_nchw = [&](auto g, auto n, auto c, auto hi, auto wi) { + std::size_t K = arg.weight_.GetLengths()[1]; + std::size_t Y = arg.weight_.GetLengths()[3]; + std::size_t X = arg.weight_.GetLengths()[4]; + + std::size_t Ho = arg.output_.GetLengths()[3]; + std::size_t Wo = arg.output_.GetLengths()[4]; + + float v_acc = 0; + + for(std::size_t y = 0; y < Y; ++y) + { + auto h_tmp = static_cast(hi) + + static_cast(arg.in_left_pads_[0]) - + static_cast(y * arg.conv_dilations_[0]); + if(h_tmp % arg.conv_strides_[0] == 0) + { + auto ho = static_cast(h_tmp) / + static_cast(arg.conv_strides_[0]); + if(ho >= 0 && ck::type_convert(ho) < Ho) + { + for(std::size_t x = 0; x < X; ++x) + { + auto w_tmp = + static_cast(wi) + + static_cast(arg.in_left_pads_[1]) - + static_cast(x * arg.conv_dilations_[1]); + if(w_tmp % arg.conv_strides_[1] == 0) + { + auto wo = + static_cast(w_tmp) / + static_cast(arg.conv_strides_[1]); + if(wo >= 0 && ck::type_convert(wo) < Wo) + { + for(std::size_t k = 0; k < K; ++k) + { + float v_out = 0; + float v_wei = 0; + + arg.out_element_op_( + v_out, + ck::type_convert( + arg.output_(g, n, k, ho, wo))); + + arg.wei_element_op_( + v_wei, + ck::type_convert( + arg.weight_(g, k, c, y, x))); + + v_acc += v_out * v_wei; + } + } + } + } + } + } + } + + float v_in; + + arg.in_element_op_(v_in, v_acc); + + arg.input_(g, n, c, hi, wi) = ck::type_convert(v_acc); + }; + + make_ParallelTensorFunctor(f_nchw, + arg.input_.GetLengths()[0], + arg.input_.GetLengths()[1], + arg.input_.GetLengths()[2], + arg.input_.GetLengths()[3], + arg.input_.GetLengths()[4])( + std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 3) + { + auto f_ncdhw = [&](auto g, auto n, auto c, auto di, auto hi, auto wi) { + std::size_t K = arg.weight_.GetLengths()[1]; + std::size_t Z = arg.weight_.GetLengths()[3]; + std::size_t Y = arg.weight_.GetLengths()[4]; + std::size_t X = arg.weight_.GetLengths()[5]; + + std::size_t Do = arg.output_.GetLengths()[3]; + std::size_t Ho = arg.output_.GetLengths()[4]; + std::size_t Wo = arg.output_.GetLengths()[5]; + + float v_acc = 0; + + for(std::size_t z = 0; z < Z; ++z) + { + auto d_tmp = static_cast(di) + + static_cast(arg.in_left_pads_[0]) - + static_cast(z * arg.conv_dilations_[0]); + if(d_tmp % 
arg.conv_strides_[0] == 0) + { + auto do_ = static_cast(d_tmp) / + static_cast(arg.conv_strides_[0]); + if(do_ >= 0 && ck::type_convert(do_) < Do) + { + for(std::size_t y = 0; y < Y; ++y) + { + auto h_tmp = + static_cast(hi) + + static_cast(arg.in_left_pads_[1]) - + static_cast(y * arg.conv_dilations_[1]); + if(h_tmp % arg.conv_strides_[1] == 0) + { + auto ho = + static_cast(h_tmp) / + static_cast(arg.conv_strides_[1]); + if(ho >= 0 && ck::type_convert(ho) < Ho) + { + for(std::size_t x = 0; x < X; ++x) + { + auto w_tmp = static_cast(wi) + + static_cast( + arg.in_left_pads_[2]) - + static_cast( + x * arg.conv_dilations_[2]); + + if(w_tmp % arg.conv_strides_[2] == 0) + { + auto wo = static_cast(w_tmp) / + static_cast( + arg.conv_strides_[2]); + if(wo >= 0 && + ck::type_convert(wo) < Wo) + { + for(std::size_t k = 0; k < K; ++k) + { + float v_out = 0; + float v_wei = 0; + + arg.out_element_op_( + v_out, + ck::type_convert(arg.output_( + g, n, k, do_, ho, wo))); + + arg.wei_element_op_( + v_wei, + ck::type_convert( + arg.weight_(g, k, c, z, y, x))); + + v_acc += v_out * v_wei; + } + } + } + } + } + } + } + } + } + } + + float v_in; + + arg.in_element_op_(v_in, v_acc); + + arg.input_(g, n, c, di, hi, wi) = ck::type_convert(v_acc); + }; + + make_ParallelTensorFunctor(f_ncdhw, + arg.input_.GetLengths()[0], + arg.input_.GetLengths()[1], + arg.input_.GetLengths()[2], + arg.input_.GetLengths()[3], + arg.input_.GetLengths()[4], + arg.input_.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(Tensor& input, + const Tensor& weight, + const Tensor& output, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{input, + weight, + output, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConvBwdData" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp new file mode 100644 index 00000000..7d62158f --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
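The backward-data reference above inverts the forward index map: for an input position wi and filter tap x it forms w_tmp = wi + left_pad - x * dilation, and only accumulates output * weight when w_tmp is divisible by the stride and the resulting output index wo lies inside the output extent. Below is a minimal standalone 1-D sketch of that gather loop using plain std::vector instead of the CK Tensor class; all names are illustrative and not part of the library.

    // Illustrative 1-D backward-data convolution (data gradient), single channel.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    std::vector<float> conv1d_bwd_data(const std::vector<float>& out_grad,
                                       const std::vector<float>& weight,
                                       std::size_t in_len,
                                       int stride, int dilation, int pad)
    {
        std::vector<float> in_grad(in_len, 0.0f);
        const std::size_t Wo = out_grad.size();
        const std::size_t X  = weight.size();

        for(std::size_t wi = 0; wi < in_len; ++wi)
        {
            float acc = 0.0f;
            for(std::size_t x = 0; x < X; ++x)
            {
                // Invert wi = wo * stride + x * dilation - pad
                const long w_tmp =
                    static_cast<long>(wi) + pad - static_cast<long>(x) * dilation;
                if(w_tmp % stride != 0)
                    continue; // this filter tap never lands on wi
                const long wo = w_tmp / stride;
                if(wo >= 0 && static_cast<std::size_t>(wo) < Wo)
                    acc += out_grad[wo] * weight[x];
            }
            in_grad[wi] = acc;
        }
        return in_grad;
    }

    int main()
    {
        // 5-element input, 3-tap filter, stride 1, dilation 1, pad 1 -> 5 outputs
        const std::vector<float> out_grad{1, 2, 3, 4, 5};
        const std::vector<float> weight{0.5f, 1.0f, -0.5f};
        for(float g : conv1d_bwd_data(out_grad, weight, 5, 1, 1, 1))
            std::cout << g << ' ';
        std::cout << '\n';
    }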
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// input descriptor in [G, N, C, Do, Ho, Wo] order +// weight descriptor in [G, K, C, Z, Y, X] order +// output descriptor in [G, N, K, Di, Hi, Wi] order +// phyiscal layout is irrelavent +template = 1 && NDimSpatial <= 3, bool>::type = false> +struct ReferenceConvBwdWeight : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in_n_c_hi_wi, + Tensor& wei_k_c_y_x, + const Tensor& out_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : input_{in_n_c_hi_wi}, + weight_{wei_k_c_y_x}, + output_{out_n_k_ho_wo}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& input_; + Tensor& weight_; + const Tensor& output_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceConvBwdWeight::Argument; + + float Run(const Argument& arg) + { + if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && + arg.weight_.GetNumOfDimension() == NDimSpatial + 3 && + arg.output_.GetNumOfDimension() == NDimSpatial + 3)) + { + throw std::runtime_error("wrong! 
inconsistent dimension"); + } + + if constexpr(NDimSpatial == 1) + { + auto f_kcx = [&](auto g, auto k, auto c, auto x) { + float v_acc = 0; + + for(std::size_t n = 0; n < arg.output_.GetLengths()[1]; ++n) + { + for(std::size_t wo = 0; wo < arg.output_.GetLengths()[3]; ++wo) + { + auto wi = static_cast(wo * arg.conv_strides_[0]) + + static_cast(x * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + + if(wi >= 0 && + ck::type_convert(wi) < arg.input_.GetLengths()[3]) + { + float v_out; + float v_in; + + arg.out_element_op_( + v_out, ck::type_convert(arg.output_(g, n, k, wo))); + + arg.in_element_op_( + v_in, ck::type_convert(arg.input_(g, n, c, wi))); + + v_acc += v_out * v_in; + } + } + } + + float v_wei; + + arg.wei_element_op_(v_wei, v_acc); + + arg.weight_(g, k, c, x) = ck::type_convert(v_wei); + }; + + make_ParallelTensorFunctor(f_kcx, + arg.weight_.GetLengths()[0], + arg.weight_.GetLengths()[1], + arg.weight_.GetLengths()[2], + arg.weight_.GetLengths()[3])( + std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 2) + { + auto f_kcyx = [&](auto g, auto k, auto c, auto y, auto x) { + std::size_t N = arg.output_.GetLengths()[1]; + + std::size_t Ho = arg.output_.GetLengths()[3]; + std::size_t Wo = arg.output_.GetLengths()[4]; + + float v_acc = 0; + + for(std::size_t n = 0; n < N; ++n) + { + for(std::size_t ho = 0; ho < Ho; ++ho) + { + auto hi = static_cast(ho * arg.conv_strides_[0]) + + static_cast(y * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + + for(std::size_t wo = 0; wo < Wo; ++wo) + { + auto wi = + static_cast(wo * arg.conv_strides_[1]) + + static_cast(x * arg.conv_dilations_[1]) - + static_cast(arg.in_left_pads_[1]); + + if(hi >= 0 && + ck::type_convert(hi) < arg.input_.GetLengths()[3] && + wi >= 0 && + ck::type_convert(wi) < arg.input_.GetLengths()[4]) + { + float v_out; + float v_in; + + arg.out_element_op_( + v_out, + ck::type_convert(arg.output_(g, n, k, ho, wo))); + + arg.in_element_op_( + v_in, ck::type_convert(arg.input_(g, n, c, hi, wi))); + + v_acc += v_out * v_in; + } + } + } + } + + float v_wei; + + arg.wei_element_op_(v_wei, v_acc); + + arg.weight_(g, k, c, y, x) = ck::type_convert(v_wei); + }; + + make_ParallelTensorFunctor(f_kcyx, + arg.weight_.GetLengths()[0], + arg.weight_.GetLengths()[1], + arg.weight_.GetLengths()[2], + arg.weight_.GetLengths()[3], + arg.weight_.GetLengths()[4])( + std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 3) + { + auto f_kczyx = [&](auto g, auto k, auto c, auto z, auto y, auto x) { + float v_acc = 0; + + for(std::size_t n = 0; n < arg.output_.GetLengths()[1]; ++n) + { + for(std::size_t do_ = 0; do_ < arg.output_.GetLengths()[3]; ++do_) + { + auto di = static_cast(do_ * arg.conv_strides_[0]) + + static_cast(z * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + for(std::size_t ho = 0; ho < arg.output_.GetLengths()[4]; ++ho) + { + auto hi = + static_cast(ho * arg.conv_strides_[1]) + + static_cast(y * arg.conv_dilations_[1]) - + static_cast(arg.in_left_pads_[1]); + for(std::size_t wo = 0; wo < arg.output_.GetLengths()[5]; ++wo) + { + auto wi = + static_cast(wo * arg.conv_strides_[2]) + + static_cast(x * arg.conv_dilations_[2]) - + static_cast(arg.in_left_pads_[2]); + + if(di >= 0 && + ck::type_convert(di) < + arg.input_.GetLengths()[3] && + hi >= 0 && + ck::type_convert(hi) < + arg.input_.GetLengths()[4] && + wi >= 0 && + ck::type_convert(wi) < + arg.input_.GetLengths()[5]) + { + float v_out; + float v_in; + + 
arg.out_element_op_(v_out, + ck::type_convert( + arg.output_(g, n, k, do_, ho, wo))); + + arg.in_element_op_(v_in, + ck::type_convert( + arg.input_(g, n, c, di, hi, wi))); + + v_acc += v_out * v_in; + } + } + } + } + } + + float v_wei; + + arg.wei_element_op_(v_wei, v_acc); + + arg.weight_(g, k, c, z, y, x) = ck::type_convert(v_wei); + }; + + make_ParallelTensorFunctor(f_kczyx, + arg.weight_.GetLengths()[0], + arg.weight_.GetLengths()[1], + arg.weight_.GetLengths()[2], + arg.weight_.GetLengths()[3], + arg.weight_.GetLengths()[4], + arg.weight_.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in_n_c_hi_wi, + Tensor& wei_k_c_y_x, + const Tensor& out_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConvBwdWeight" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp new file mode 100644 index 00000000..b8d47d21 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp @@ -0,0 +1,339 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// +// @brief Reference implementation for forward convolution. +// +// @paragraph +// Tensor descriptor in GNCHW/GKCXY/GNKHW dimensional order +// Supports both GNCHW/NGCHW as well as GNHWC/NHWGC physical layout +// as long as dimensions in tensor descriptor is in GNCHW order +// +// @tparam InDataType Input tensor data type. +// @tparam WeiDataType Weights tensor data type. +// @tparam OutDataType Output tensor data type. +// @tparam InElementwiseOperation Functor for input tensor elementwise +// operation. +// @tparam WeiElementwiseOperation Functor for weights tensor elementwise +// operation. +// @tparam NDimSpatial Number of spatial dimensions. 
+// +// input descriptor in [G, N, C, Do, Ho, Wo] order +// weight descriptor in [G, K, C, Z, Y, X] order +// output descriptor in [G, N, K, Di, Hi, Wi] order +// phyiscal layout is irrelavent +template = 1 && NDimSpatial <= 3, bool>::type = false> +struct ReferenceConvFwd : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& input, + const Tensor& weight, + Tensor& output, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : input_{input}, + weight_{weight}, + output_{output}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& input_; + const Tensor& weight_; + Tensor& output_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceConvFwd::Argument; + + float Run(const Argument& arg) + { + if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && + arg.weight_.GetNumOfDimension() == NDimSpatial + 3 && + arg.output_.GetNumOfDimension() == NDimSpatial + 3)) + { + throw std::runtime_error("wrong! inconsistent dimension"); + } + + if constexpr(NDimSpatial == 1) + { + auto func = [&](auto g, auto n, auto k, auto wo) { + float v_acc = 0; + + for(std::size_t c = 0; c < arg.weight_.GetLengths()[2]; ++c) + { + for(std::size_t x = 0; x < arg.weight_.GetLengths()[3]; ++x) + { + auto wi = static_cast(wo * arg.conv_strides_[0]) + + static_cast(x * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + + if(wi >= 0 && + ck::type_convert(wi) < arg.input_.GetLengths()[3]) + { + float v_in; + float v_wei; + + arg.in_element_op_( + v_in, ck::type_convert(arg.input_(g, n, c, wi))); + + arg.wei_element_op_( + v_wei, ck::type_convert(arg.weight_(g, k, c, x))); + + v_acc += v_in * v_wei; + } + } + } + + float v_out; + + arg.out_element_op_(v_out, v_acc); + + arg.output_(g, n, k, wo) = ck::type_convert(v_out); + }; + + make_ParallelTensorFunctor(func, + arg.output_.GetLengths()[0], + arg.output_.GetLengths()[1], + arg.output_.GetLengths()[2], + arg.output_.GetLengths()[3])( + std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 2) + { + auto func = [&](auto g, auto n, auto k, auto ho, auto wo) { + float v_acc = 0; + + for(std::size_t c = 0; c < arg.weight_.GetLengths()[2]; ++c) + { + for(std::size_t y = 0; y < arg.weight_.GetLengths()[3]; ++y) + { + auto hi = static_cast(ho * arg.conv_strides_[0]) + + static_cast(y * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + + for(std::size_t x = 0; x < arg.weight_.GetLengths()[4]; ++x) + { + auto wi = + static_cast(wo * arg.conv_strides_[1]) + + static_cast(x * arg.conv_dilations_[1]) - + static_cast(arg.in_left_pads_[1]); + + if(hi >= 0 && + ck::type_convert(hi) < arg.input_.GetLengths()[3] && + wi >= 0 && + ck::type_convert(wi) < arg.input_.GetLengths()[4]) + { + float v_in; + float v_wei; + + arg.in_element_op_( + v_in, 
ck::type_convert(arg.input_(g, n, c, hi, wi))); + + arg.wei_element_op_( + v_wei, ck::type_convert(arg.weight_(g, k, c, y, x))); + + v_acc += v_in * v_wei; + } + } + } + } + + float v_out; + + arg.out_element_op_(v_out, v_acc); + + arg.output_(g, n, k, ho, wo) = ck::type_convert(v_out); + }; + + make_ParallelTensorFunctor(func, + arg.output_.GetLengths()[0], + arg.output_.GetLengths()[1], + arg.output_.GetLengths()[2], + arg.output_.GetLengths()[3], + arg.output_.GetLengths()[4])( + std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 3) + { + auto func = [&](auto g, auto n, auto k, auto d_o, auto ho, auto wo) { + float v_acc = 0; + + for(std::size_t c = 0; c < arg.weight_.GetLengths()[2]; ++c) + { + for(std::size_t z = 0; z < arg.weight_.GetLengths()[3]; ++z) + { + auto di = static_cast(d_o * arg.conv_strides_[0]) + + static_cast(z * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + for(std::size_t y = 0; y < arg.weight_.GetLengths()[4]; ++y) + { + auto hi = + static_cast(ho * arg.conv_strides_[1]) + + static_cast(y * arg.conv_dilations_[1]) - + static_cast(arg.in_left_pads_[1]); + for(std::size_t x = 0; x < arg.weight_.GetLengths()[5]; ++x) + { + auto wi = + static_cast(wo * arg.conv_strides_[2]) + + static_cast(x * arg.conv_dilations_[2]) - + static_cast(arg.in_left_pads_[2]); + if(di >= 0 && + ck::type_convert(di) < + arg.input_.GetLengths()[3] && + hi >= 0 && + ck::type_convert(hi) < + arg.input_.GetLengths()[4] && + wi >= 0 && + ck::type_convert(wi) < + arg.input_.GetLengths()[5]) + { + float v_in; + float v_wei; + + arg.in_element_op_(v_in, + ck::type_convert( + arg.input_(g, n, c, di, hi, wi))); + + arg.wei_element_op_( + v_wei, + ck::type_convert(arg.weight_(g, k, c, z, y, x))); + + v_acc += v_in * v_wei; + } + } + } + } + } + + float v_out; + + arg.out_element_op_(v_out, v_acc); + + arg.output_(g, n, k, d_o, ho, wo) = ck::type_convert(v_out); + }; + + make_ParallelTensorFunctor(func, + arg.output_.GetLengths()[0], + arg.output_.GetLengths()[1], + arg.output_.GetLengths()[2], + arg.output_.GetLengths()[3], + arg.output_.GetLengths()[4], + arg.output_.GetLengths()[5])( + std::thread::hardware_concurrency()); + + return 0; + } + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override + { + return NDimSpatial >= 1 && NDimSpatial <= 3; + } + + static auto MakeArgument(const Tensor& input, + const Tensor& weight, + Tensor& output, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{input, + weight, + output, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConvFwd" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host 
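ReferenceConvFwd above is a direct (naive) convolution: each output element walks the filter window and maps back to input coordinates via hi = ho * stride + y * dilation - left_pad, skipping taps that fall into the zero padding. Below is a minimal standalone 2-D, single-image sketch of the same loop structure on flat row-major buffers; the names are illustrative and not the CK API.

    // Illustrative direct 2-D forward convolution for one image.
    // Layouts (flat, C-order): in[C][Hi][Wi], wei[K][C][Y][X], out[K][Ho][Wo].
    #include <vector>

    void conv2d_fwd(const std::vector<float>& in, const std::vector<float>& wei,
                    std::vector<float>& out,
                    int C, int Hi, int Wi, int K, int Y, int X,
                    int Ho, int Wo, int stride, int dilation, int pad)
    {
        for(int k = 0; k < K; ++k)
            for(int ho = 0; ho < Ho; ++ho)
                for(int wo = 0; wo < Wo; ++wo)
                {
                    float acc = 0.0f;
                    for(int c = 0; c < C; ++c)
                        for(int y = 0; y < Y; ++y)
                            for(int x = 0; x < X; ++x)
                            {
                                // Forward index map for this output/tap pair
                                const int hi = ho * stride + y * dilation - pad;
                                const int wi = wo * stride + x * dilation - pad;
                                if(hi < 0 || hi >= Hi || wi < 0 || wi >= Wi)
                                    continue; // tap falls into the zero padding
                                acc += in[(c * Hi + hi) * Wi + wi] *
                                       wei[((k * C + c) * Y + y) * X + x];
                            }
                    out[(k * Ho + ho) * Wo + wo] = acc;
                }
    }

    int main()
    {
        // 1 channel, 4x4 input, one 3x3 filter, stride 1, dilation 1, pad 1 -> 4x4 output
        std::vector<float> in(1 * 4 * 4, 1.0f), wei(1 * 1 * 3 * 3, 1.0f), out(1 * 4 * 4, 0.0f);
        conv2d_fwd(in, wei, out, 1, 4, 4, 1, 3, 3, 4, 4, 1, 1, 1);
    }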
+} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp new file mode 100644 index 00000000..be22003f --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// out[N, Ho, Wo, K] = +// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) +template +struct ReferenceConvFwd_Bias_Activation : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : in_n_c_hi_wi_{in_n_c_hi_wi}, + wei_k_c_y_x_{wei_k_c_y_x}, + out_n_k_ho_wo_{out_n_k_ho_wo}, + bias_k_{bias_k}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& in_n_c_hi_wi_; + const Tensor& wei_k_c_y_x_; + Tensor& out_n_k_ho_wo_; + const Tensor& bias_k_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceConvFwd_Bias_Activation::Argument; + + float Run(const Argument& arg) + { + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + float v_acc = 0; + + for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + { + for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + { + auto hi = ck::type_convert(ho * arg.conv_strides_[0]) + + ck::type_convert(y * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + { + auto wi = + ck::type_convert(wo * arg.conv_strides_[1]) + + ck::type_convert(x * arg.conv_dilations_[1]) - + ck::type_convert(arg.in_left_pads_[1]); + if(hi >= 0 && + ck::type_convert(hi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && + wi >= 0 && + ck::type_convert(wi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + { + float v_in; + float v_wei; + + arg.in_element_op_( + v_in, + static_cast(arg.in_n_c_hi_wi_(n, c, hi, wi))); + arg.wei_element_op_( + v_wei, static_cast(arg.wei_k_c_y_x_(k, c, y, x))); + + v_acc += v_in * v_wei; + } + } + } + } + + float v_out; + + arg.out_element_op_(v_out, v_acc, static_cast(arg.bias_k_(k))); + + arg.out_n_k_ho_wo_(n, k, ho, wo) = v_out; + }; + + make_ParallelTensorFunctor(f_nchw, + arg.out_n_k_ho_wo_.mDesc.GetLengths()[0], + 
arg.out_n_k_ho_wo_.mDesc.GetLengths()[1], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[2], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo, + bias_k, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConvFwd_Bias_Activation" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp new file mode 100644 index 00000000..f949f27f --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
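The bias+activation reference drives a three-argument output functor: the epilogue receives the convolution accumulator and the per-channel bias and writes the activated value (the *_add variant adds a fourth, residual argument). A small sketch of that calling convention follows, with an illustrative AddRelu functor and an N x K x Ho x Wo accumulator in flat row-major storage; these names are assumptions for illustration, not the CK definitions.

    // Illustrative 3-argument epilogue: y = activate(acc + bias), bias broadcast per channel.
    #include <algorithm>
    #include <vector>

    struct AddRelu
    {
        void operator()(float& y, float acc, float bias) const
        {
            y = std::max(acc + bias, 0.0f); // activate(conv + bias)
        }
    };

    void apply_bias_activation(std::vector<float>& out, const std::vector<float>& bias,
                               int N, int K, int Ho, int Wo, const AddRelu& op)
    {
        for(int n = 0; n < N; ++n)
            for(int k = 0; k < K; ++k)
                for(int i = 0; i < Ho * Wo; ++i)
                {
                    float& y = out[(n * K + k) * Ho * Wo + i];
                    op(y, y, bias[k]); // bias is broadcast along N, Ho, Wo
                }
    }

    int main()
    {
        std::vector<float> out(1 * 2 * 2 * 2, -1.0f), bias{1.0f, -3.0f};
        apply_bias_activation(out, bias, 1, 2, 2, 2, AddRelu{});
    }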
+ +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// out[N, Ho, Wo, K] = +// activate(in[N, Hi, Wi, C] * wei[K, Y, X, C] + bias[K]) + residual[N, Ho, Wo, K] +template +struct ReferenceConvFwd_Bias_Activation_Add : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + const Tensor& resi_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + : in_n_c_hi_wi_{in_n_c_hi_wi}, + wei_k_c_y_x_{wei_k_c_y_x}, + out_n_k_ho_wo_{out_n_k_ho_wo}, + bias_k_{bias_k}, + resi_n_k_ho_wo_{resi_n_k_ho_wo}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + in_element_op_{in_element_op}, + wei_element_op_{wei_element_op}, + out_element_op_{out_element_op} + { + } + + const Tensor& in_n_c_hi_wi_; + const Tensor& wei_k_c_y_x_; + Tensor& out_n_k_ho_wo_; + const Tensor& bias_k_; + const Tensor& resi_n_k_ho_wo_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + InElementwiseOperation in_element_op_; + WeiElementwiseOperation wei_element_op_; + OutElementwiseOperation out_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceConvFwd_Bias_Activation_Add::Argument; + + float Run(const Argument& arg) + { + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + float v_acc = 0; + + for(std::size_t c = 0; c < arg.wei_k_c_y_x_.mDesc.GetLengths()[1]; ++c) + { + for(std::size_t y = 0; y < arg.wei_k_c_y_x_.mDesc.GetLengths()[2]; ++y) + { + auto hi = ck::type_convert(ho * arg.conv_strides_[0]) + + ck::type_convert(y * arg.conv_dilations_[0]) - + ck::type_convert(arg.in_left_pads_[0]); + for(std::size_t x = 0; x < arg.wei_k_c_y_x_.mDesc.GetLengths()[3]; ++x) + { + auto wi = + ck::type_convert(wo * arg.conv_strides_[1]) + + ck::type_convert(x * arg.conv_dilations_[1]) - + ck::type_convert(arg.in_left_pads_[1]); + if(hi >= 0 && + ck::type_convert(hi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[2] && + wi >= 0 && + ck::type_convert(wi) < + arg.in_n_c_hi_wi_.mDesc.GetLengths()[3]) + { + float v_in; + float v_wei; + + arg.in_element_op_( + v_in, + static_cast(arg.in_n_c_hi_wi_(n, c, hi, wi))); + arg.wei_element_op_( + v_wei, static_cast(arg.wei_k_c_y_x_(k, c, y, x))); + + v_acc += v_in * v_wei; + } + } + } + } + + float v_out; + + arg.out_element_op_(v_out, + v_acc, + static_cast(arg.bias_k_(k)), + static_cast(arg.resi_n_k_ho_wo_(n, k, ho, wo))); + + arg.out_n_k_ho_wo_(n, k, ho, wo) = v_out; + }; + + make_ParallelTensorFunctor(f_nchw, + arg.out_n_k_ho_wo_.mDesc.GetLengths()[0], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[1], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[2], + arg.out_n_k_ho_wo_.mDesc.GetLengths()[3])( + std::thread::hardware_concurrency()); + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool 
IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in_n_c_hi_wi, + const Tensor& wei_k_c_y_x, + Tensor& out_n_k_ho_wo, + const Tensor& bias_k, + const Tensor& resi_n_k_ho_wo, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + InElementwiseOperation in_element_op, + WeiElementwiseOperation wei_element_op, + OutElementwiseOperation out_element_op) + { + return Argument{in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo, + bias_k, + resi_n_k_ho_wo, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceConvFwd_Bias_Activation_Add" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp new file mode 100644 index 00000000..6728bb1f --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceGemm : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_m_k_{a_m_k}, + b_k_n_{b_k_n}, + c_m_n_{c_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_m_k_; + const Tensor& b_k_n_; + Tensor& c_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceGemm::Argument; + + float Run(const Argument& arg) + { + auto f_mk_kn_mn = [&](auto m, auto n) { + const int K = arg.a_m_k_.mDesc.GetLengths()[1]; + + AccDataType v_acc = 0; + + for(int k = 0; k < K; ++k) + { + ADataType v_a; + BDataType v_b; + + arg.a_element_op_(v_a, arg.a_m_k_(m, k)); + arg.b_element_op_(v_b, arg.b_k_n_(k, n)); + + v_acc += + ck::type_convert(v_a) * ck::type_convert(v_b); + } + + AccDataType v_c; + + arg.c_element_op_(v_c, v_acc); + + arg.c_m_n_(m, n) = ck::type_convert(v_c); + }; + + make_ParallelTensorFunctor( + f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + 
static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceGemm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp new file mode 100644 index 00000000..c77d22f4 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_2d.hpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceGemmBias2D : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k, + const Tensor& b_k_n, + const Tensor& c0_m_n, + Tensor& c_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_m_k_{a_m_k}, + b_k_n_{b_k_n}, + c0_m_n_{c0_m_n}, + c_m_n_{c_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_m_k_; + const Tensor& b_k_n_; + const Tensor& c0_m_n_; + Tensor& c_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceGemmBias2D::Argument; + + float Run(const Argument& arg) + { + auto f_mk_kn_mn = [&](auto m, auto n) { + const int K = arg.a_m_k_.mDesc.GetLengths()[1]; + + AccDataType a = 0; + AccDataType b = 0; + AccDataType acc = 0; + + for(int k = 0; k < K; ++k) + { + arg.a_element_op_(a, ck::type_convert(arg.a_m_k_(m, k))); + arg.b_element_op_(b, ck::type_convert(arg.b_k_n_(k, n))); + acc += a * b; + } + + CDataType cast_acc = static_cast(acc); + arg.c_element_op_(arg.c_m_n_(m, n), cast_acc, arg.c0_m_n_(m, n)); + }; + + make_ParallelTensorFunctor( + f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k, + const Tensor& b_k_n, 
+ const Tensor& c0_m_n, + Tensor& c_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{a_m_k, b_k_n, c0_m_n, c_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceGemmBias2D" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp new file mode 100644 index 00000000..7dfc3c1e --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation.hpp @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceGemmBiasActivation : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const Tensor& c0_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_m_k_{a_m_k}, + b_k_n_{b_k_n}, + c_m_n_{c_m_n}, + c0_n_{c0_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_m_k_; + const Tensor& b_k_n_; + Tensor& c_m_n_; + const Tensor& c0_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceGemmBiasActivation::Argument; + + float Run(const Argument& arg) + { + auto f_mk_kn_mn = [&](auto m, auto n) { + const int K = arg.a_m_k_.mDesc.GetLengths()[1]; + + float v_acc = 0; + + for(int k = 0; k < K; ++k) + { + float v_a; + float v_b; + + arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k))); + arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n))); + + v_acc += v_a * v_b; + } + + float v_c; + + arg.c_element_op_(v_c, v_acc, static_cast(arg.c0_n_(n))); + + arg.c_m_n_(m, n) = v_c; + }; + + make_ParallelTensorFunctor( + f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const Tensor& c0_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{a_m_k, b_k_n, c_m_n, c0_n, a_element_op, 
b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceGemmBiasActivation" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp new file mode 100644 index 00000000..99102a40 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_bias_activation_add.hpp @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceGemmBiasActivationAdd : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const Tensor& c0_n, + const Tensor& c1_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : a_m_k_{a_m_k}, + b_k_n_{b_k_n}, + c_m_n_{c_m_n}, + c0_n_{c0_n}, + c1_m_n_{c1_m_n}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const Tensor& a_m_k_; + const Tensor& b_k_n_; + Tensor& c_m_n_; + const Tensor& c0_n_; + const Tensor& c1_m_n_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceGemmBiasActivationAdd::Argument; + + float Run(const Argument& arg) + { + auto f_mk_kn_mn = [&](auto m, auto n) { + const int K = arg.a_m_k_.mDesc.GetLengths()[1]; + + float v_acc = 0; + + for(int k = 0; k < K; ++k) + { + float v_a; + float v_b; + + arg.a_element_op_(v_a, static_cast(arg.a_m_k_(m, k))); + arg.b_element_op_(v_b, static_cast(arg.b_k_n_(k, n))); + + v_acc += v_a * v_b; + } + + float v_c; + + arg.c_element_op_(v_c, + v_acc, + static_cast(arg.c0_n_(n)), + static_cast(arg.c1_m_n_(m, n))); + + arg.c_m_n_(m, n) = v_c; + }; + + make_ParallelTensorFunctor( + f_mk_kn_mn, arg.c_m_n_.mDesc.GetLengths()[0], arg.c_m_n_.mDesc.GetLengths()[1])( + std::thread::hardware_concurrency()); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const Tensor& c0_n, + const Tensor& c1_m_n, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{ + a_m_k, b_k_n, c_m_n, c0_n, c1_m_n, a_element_op, b_element_op, c_element_op}; + } + + static auto 
MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceGemmBiasActivationAdd" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp new file mode 100644 index 00000000..28132aa1 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_layernorm.hpp @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +// D = Layernorm(acc_element_op(A * B + broadcast(bias)) + add) * broadcast(gamma) + broadcast(beta) +template +struct ReferenceGemmLayernorm : public device::BaseOperator +{ + using ReferenceGemmInstance = ReferenceGemm; + + template + static void RunLayernorm(Tensor& result, + const Tensor& acc, // MxN + const Tensor& gamma, // 1xN + const Tensor& beta, // 1xN + const InDataType epsilon = 1e-5) + { + assert(acc.mDesc.GetLengths()[1] == gamma.mDesc.GetLengths()[0] && + acc.mDesc.GetLengths()[1] == beta.mDesc.GetLengths()[0]); + + size_t M = acc.mDesc.GetLengths()[0]; + size_t N = acc.mDesc.GetLengths()[1]; + + Tensor avg_acc_sq({M}); + Tensor avg_acc({M}); + Tensor acc_layernorm(acc); + + // reduce N dim + for(size_t i = 0; i < M; i++) + { + ComputeDataType sum_acc_sq = 0; + ComputeDataType sum_acc = 0; + for(size_t j = 0; j < N; j++) + { + sum_acc_sq += acc_layernorm(i, j) * acc_layernorm(i, j); + sum_acc += acc_layernorm(i, j); + } + avg_acc_sq(i) = sum_acc_sq / N; + avg_acc(i) = sum_acc / N; + } + + // normalize + acc_layernorm.ForEach([&](auto& self, auto idx) { + self(idx[0], idx[1]) = + (self(idx[0], idx[1]) - avg_acc(idx[0])) / + sqrt(avg_acc_sq(idx[0]) - avg_acc(idx[0]) * avg_acc(idx[0]) + epsilon); + }); + + // affine + acc_layernorm.ForEach([&](auto& self, auto idx) { + self(idx[0], idx[1]) = self(idx[0], idx[1]) * gamma(idx[1]) + beta(idx[1]); + }); + + // cast + result = acc_layernorm.template CopyAsType(); + } + + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const Tensor& c0_n_bias, // 1xN + const Tensor& c0_m_n_add, // MxN + const Tensor& c0_n_gamma, // 1xN + const Tensor& c0_n_beta, // 1xN + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + CElementwiseOperation c_element_op, + const CDataType epsilon = 1e-5) + : a_m_k_{a_m_k}, + b_k_n_{b_k_n}, + c_m_n_{c_m_n}, + c0_n_bias_{c0_n_bias}, + c0_m_n_add_{c0_m_n_add}, + c0_n_gamma_{c0_n_gamma}, + c0_n_beta_{c0_n_beta}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + acc_element_op_{acc_element_op}, + c_element_op_{c_element_op}, + epsilon_{epsilon} + { + } + + const Tensor& a_m_k_; + const Tensor& b_k_n_; + Tensor& c_m_n_; + const Tensor& c0_n_bias_; + const Tensor& c0_m_n_add_; + const Tensor& c0_n_gamma_; + const Tensor& c0_n_beta_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation 
b_element_op_; + AccElementwiseOperation acc_element_op_; + CElementwiseOperation c_element_op_; + + const CDataType epsilon_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + // using Argument = ReferenceGemm::Argument; + + float Run(const Argument& arg) + { + Tensor acc_m_n(arg.c_m_n_.mDesc); + acc_m_n.GenerateTensorValue(GeneratorTensor_1{0}); + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + auto ref_argument = ref_gemm.MakeArgument(arg.a_m_k_, + arg.b_k_n_, + acc_m_n, + arg.a_element_op_, + arg.b_element_op_, + element_wise::PassThrough{}); + + // gemm + ref_invoker.Run(ref_argument); + + // activation(acc + bias) + acc_m_n.ForEach([&](auto& self, auto idx) { + AccDataType out; + arg.acc_element_op_(out, acc_m_n(idx[0], idx[1]) + arg.c0_n_bias_(idx[1])); + self(idx[0], idx[1]) = out; + }); + + // add from other layers + acc_m_n.ForEach([&](auto& self, auto idx) { + self(idx[0], idx[1]) += arg.c0_m_n_add_(idx[0], idx[1]); + }); + + // layernorm + RunLayernorm(arg.c_m_n_, acc_m_n, arg.c0_n_gamma_, arg.c0_n_beta_); + + // elementwise op + arg.c_m_n_.ForEach([&](auto& self, auto idx) { + arg.c_element_op_(self(idx[0], idx[1]), self(idx[0], idx[1])); + }); + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const Tensor& c0_n_bias, // 1xN + const Tensor& c0_m_n_add, // 1xN + const Tensor& c0_n_gamma, // 1xN + const Tensor& c0_n_beta, // 1xN + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + AccElementwiseOperation acc_element_op, + CElementwiseOperation c_element_op, + const CDataType epsilon = 1e-5) + { + return Argument{a_m_k, + b_k_n, + c_m_n, + c0_n_bias, + c0_m_n_add, + c0_n_gamma, + c0_n_beta, + a_element_op, + b_element_op, + acc_element_op, + c_element_op, + epsilon}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceGemmLayernorm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp new file mode 100644 index 00000000..fedd4dce --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
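RunLayernorm above normalizes each GEMM output row from running sums of x and x^2, i.e. var = E[x^2] - E[x]^2, then applies the gamma/beta affine step. Below is a standalone sketch of that row-wise epilogue on a flat M x N buffer; it omits the bias, residual add and final elementwise op that the reference folds in, and all names are illustrative.

    // Illustrative row-wise layernorm: y = (x - mean) / sqrt(var + eps) * gamma + beta.
    #include <cmath>
    #include <cstddef>
    #include <vector>

    void layernorm_rows(std::vector<float>& acc,         // M x N, row-major, normalized in place
                        const std::vector<float>& gamma, // N
                        const std::vector<float>& beta,  // N
                        std::size_t M, std::size_t N, float eps = 1e-5f)
    {
        for(std::size_t m = 0; m < M; ++m)
        {
            float sum = 0.0f, sum_sq = 0.0f;
            for(std::size_t n = 0; n < N; ++n)
            {
                const float x = acc[m * N + n];
                sum += x;
                sum_sq += x * x;
            }
            const float mean    = sum / N;
            const float var     = sum_sq / N - mean * mean; // E[x^2] - E[x]^2
            const float inv_std = 1.0f / std::sqrt(var + eps);
            for(std::size_t n = 0; n < N; ++n)
            {
                const float x  = acc[m * N + n];
                acc[m * N + n] = (x - mean) * inv_std * gamma[n] + beta[n];
            }
        }
    }

    int main()
    {
        std::vector<float> acc{1, 2, 3, 4, 5, 6}; // 2 x 3
        std::vector<float> gamma{1, 1, 1}, beta{0, 0, 0};
        layernorm_rows(acc, gamma, beta, 2, 3);
    }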
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceGroupnorm : public device::BaseOperator +{ + // x = [N, H, W, G, C] + // y = [N, H, W, G, C] + // reduce dim [H, W, C], mean, var = [N, G] + // gamma, beta = [G, C] + // beta: [G, C] + struct Argument : public device::BaseArgument + { + Argument(const Tensor& x, + const Tensor& gamma, + const Tensor& beta, + Tensor& y, + AccElementwiseOperation acc_elementwise_op, + const std::vector lengths, + AccDataType epsilon) + : x_(x), + gamma_(gamma), + beta_(beta), + y_(y), + acc_elementwise_op_(acc_elementwise_op), + lengths_(lengths), + epsilon_(epsilon) + { + } + + const Tensor x_; + const Tensor gamma_; + const Tensor beta_; + Tensor& y_; + AccElementwiseOperation acc_elementwise_op_; + std::vector lengths_; + AccDataType epsilon_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + int N = arg.lengths_[0]; + int H = arg.lengths_[1]; + int W = arg.lengths_[2]; + int G = arg.lengths_[3]; + int C = arg.lengths_[4]; + + Tensor mean({N, G}); + Tensor var({N, G}); + + // Compute mean & var in [H, W, C] by Welford Algorithm + // TODO - parallel for each HWC + // TODO - address calculation + for(int n = 0; n < N; ++n) + { + for(int g = 0; g < G; ++g) + { + AccDataType mean_val = type_convert(0.0f); + AccDataType var_val = type_convert(0.0f); + int32_t curr_count = 0; + + for(int h = 0; h < H; ++h) + { + for(int w = 0; w < W; ++w) + { + for(int c = 0; c < C; ++c) + { + curr_count++; + AccDataType x = type_convert(arg.x_(n, h, w, g, c)); + AccDataType delta = x - mean_val; + mean_val += delta / curr_count; + AccDataType delta2 = x - mean_val; + var_val += delta * delta2; + } + } + } + + mean(n, g) = mean_val; + var(n, g) = var_val / curr_count; + } + } + + // Normalization + for(int n = 0; n < N; ++n) + { + for(int h = 0; h < H; ++h) + { + for(int w = 0; w < W; ++w) + { + for(int g = 0; g < G; ++g) + { + for(int c = 0; c < C; ++c) + { + AccDataType x = type_convert(arg.x_(n, h, w, g, c)); + AccDataType gamma = type_convert(arg.gamma_(g, c)); + AccDataType beta = type_convert(arg.beta_(g, c)); + AccDataType mean_val = type_convert(mean(n, g)); + AccDataType var_val = type_convert(var(n, g)); + AccDataType y = gamma * (x - mean_val) / + ck::math::sqrt(arg.epsilon_ + var_val) + + beta; + arg.acc_elementwise_op_(y, y); + arg.y_(n, h, w, g, c) = type_convert(y); + } + } + } + } + } + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + if(p_arg_->lengths_.size() != 5) + return false; + + return true; + } + + static auto MakeArgument(const Tensor& x, + const Tensor& gamma, + const Tensor& beta, + Tensor& y, + AccElementwiseOperation acc_elementwise_op, + const std::vector lengths, + AccDataType epsilon) + { + return Argument{x, gamma, beta, y, acc_elementwise_op, lengths, epsilon}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr 
MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceLayernorm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp new file mode 100644 index 00000000..680d94f7 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceLayernorm : public device::BaseOperator +{ + // TODO - support generic layernorm + static_assert((Rank == 2 && NumReduceDim == 1), "Only support 2D version so far"); + + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& x_m_n, + const Tensor& gamma_n, + const Tensor& beta_n, + Tensor& y_m_n, + AccElementwiseOperation acc_elementwise_op, + const std::vector lengths, + const std::vector reduceDims, + AccDataType epsilon) + : x_m_n_(x_m_n), + gamma_n_(gamma_n), + beta_n_(beta_n), + y_m_n_(y_m_n), + acc_elementwise_op_(acc_elementwise_op), + lengths_(lengths), + reduceDims_(reduceDims), + epsilon_(epsilon) + { + } + + const Tensor x_m_n_; + const Tensor gamma_n_; + const Tensor beta_n_; + Tensor& y_m_n_; + AccElementwiseOperation acc_elementwise_op_; + std::vector lengths_; + std::vector reduceDims_; + AccDataType epsilon_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + int M = arg.lengths_[0]; + int N = arg.lengths_[1]; + + Tensor mean({M}); + Tensor var({M}); + + for(int m = 0; m < M; ++m) + { + mean(m) = 0; + var(m) = 0; + + for(int n = 0; n < N; ++n) + { + auto x_val = ck::type_convert(arg.x_m_n_(m, n)); + mean(m) += x_val; + var(m) += x_val * x_val; + } + + mean(m) = mean(m) / N; + var(m) = (var(m) / N) - (mean(m) * mean(m)); + } + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + auto x_val = ck::type_convert(arg.x_m_n_(m, n)); + auto y_val = (x_val - mean(m)) / sqrt(var(m) + arg.epsilon_); + y_val = (y_val * arg.gamma_n_(n)) + arg.beta_n_(n); + arg.acc_elementwise_op_(y_val, y_val); + arg.y_m_n_(m, n) = ck::type_convert(y_val); + } + } + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + const Argument* p_arg_ = dynamic_cast(p_arg); + + // TODO - support generic layernorm + if(p_arg_->lengths_.size() != 2) + return false; + + if(p_arg_->reduceDims_.size() != 1) + return false; + + if(p_arg_->reduceDims_[0] != 1) + return false; + + return true; + } + + static auto MakeArgument(const Tensor& x_m_n, + const Tensor& gamma_n, + const Tensor& 
beta_n, + Tensor& y_m_n, + AccElementwiseOperation acc_elementwise_op, + const std::vector lengths, + const std::vector reduceDims, + AccDataType epsilon) + { + return Argument{ + x_m_n, gamma_n, beta_n, y_m_n, acc_elementwise_op, lengths, reduceDims, epsilon}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceLayernorm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp new file mode 100644 index 00000000..4839eb8a --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceSoftmax : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const Tensor& in, + Tensor& out, + AccDataType alpha, + AccDataType beta, + const std::vector sm_reduce_dims) + : in_(in), out_(out), alpha_(alpha), beta_(beta), sm_reduce_dims_(sm_reduce_dims) + { + // std::cout << "debug: scalar dims: "; + for(size_t i = 0; i < in.mDesc.GetNumOfDimension(); i++) + { + if(std::find(sm_reduce_dims.begin(), sm_reduce_dims.end(), i) == + sm_reduce_dims.end()) + { + sm_scalar_dims_.push_back(i); + // std::cout << i << ", "; + } + } + // std::cout << std::endl; + } + + const Tensor& in_; + Tensor& out_; + AccDataType alpha_; + AccDataType beta_; + std::vector sm_reduce_dims_; + std::vector sm_scalar_dims_; // dim after internal max/sum reduction + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + std::vector scalar_lengths; + for(index_t dim : arg.sm_scalar_dims_) + { + scalar_lengths.push_back(arg.in_.mDesc.GetLengths()[dim]); + } + // max and sum reduction with final reduced values of dim=0 is a scalar so give it + // appropriate lengths of {1} + if(arg.sm_scalar_dims_.size() == 0) + { + scalar_lengths.push_back(1); + } + + Tensor reduce_max(scalar_lengths); + reduce_max.GenerateTensorValue( + GeneratorTensor_1{std::numeric_limits::lowest()}); + Tensor reduce_sum(scalar_lengths); + reduce_sum.GenerateTensorValue(GeneratorTensor_1{0}); + + // when final reduced values is of dim=0, the index will be transformed into empty + // std::vector which is actually a valid input for Tensor::operator(std::vector) and + // internally accesses 0'th element + auto to_sm_scalar_idx = [&](auto idx) { + std::vector sm_scalar_idx; + for(index_t dim : arg.sm_scalar_dims_) + { + sm_scalar_idx.push_back(idx[dim]); + } + return sm_scalar_idx; + }; + + arg.in_.ForEach([&](auto& self, auto idx) { + reduce_max(to_sm_scalar_idx(idx)) = std::max( + reduce_max(to_sm_scalar_idx(idx)), ck::type_convert(self(idx))); + }); + + // LogRangeAsType(std::cout << "reduce_max: ", 
reduce_max.mData, ",") << + // std::endl; + + Tensor in_stable(arg.in_.mDesc); + in_stable.ForEach([&](auto& self, auto idx) { + // numerator = exp(x - max(x)) + self(idx) = std::exp(ck::type_convert(arg.in_(idx)) - + reduce_max(to_sm_scalar_idx(idx))); + }); + + // LogRangeAsType(std::cout << "in_stable: ", in_stable.mData, ",") << std::endl; + + in_stable.ForEach([&](auto& self, auto idx) { + // denominator = sum(exp(x - max(x))) + reduce_sum(to_sm_scalar_idx(idx)) += self(idx); + }); + + // LogRangeAsType(std::cout << "reduce_sum: ", reduce_sum.mData, ",") << + // std::endl; + + arg.out_.ForEach([&](auto& self, auto idx) { + AccDataType temp_result = + arg.alpha_ * in_stable(idx) / reduce_sum(to_sm_scalar_idx(idx)) + + arg.beta_ * self(idx); + self(idx) = ck::type_convert(temp_result); + }); + + // LogRangeAsType(std::cout << "out: ", arg.out_.mData, ",") << std::endl; + // reduction along reduce dims + // LogRangeAsType(std::cout << "reduce_max: ", reduce_max.mData, ",") << + // std::endl; LogRangeAsType(std::cout << "reduce_sum: ", reduce_sum.mData, ",") + // << std::endl; + + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const Tensor& in, + Tensor& out, + AccDataType alpha, + AccDataType beta, + const std::vector sm_reduce_dims) + { + return Argument{in, out, alpha, beta, sm_reduce_dims}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceSoftmax" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp new file mode 100644 index 00000000..b6a9b0fb --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_sparse_embedding3_forward_layernorm.hpp @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
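
The ReferenceSoftmax host implementation above uses the standard max-subtraction trick before exponentiating, then scales the normalized result by alpha and blends in beta times the previous output. A minimal standalone sketch of that formula on a single row (plain C++, independent of the CK Tensor utilities; values and sizes are illustrative only):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Sketch: y[i] = alpha * exp(x[i] - max(x)) / sum_j exp(x[j] - max(x)) + beta * y[i]
// This mirrors the max-subtraction used by the reference softmax to avoid overflow.
int main()
{
    std::vector<float> x{1.0f, 2.0f, 3.0f};
    std::vector<float> y(x.size(), 0.0f); // prior output, blended in via beta
    const float alpha = 1.0f, beta = 0.0f;

    const float x_max = *std::max_element(x.begin(), x.end());

    float sum = 0.0f;
    std::vector<float> e(x.size());
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        e[i] = std::exp(x[i] - x_max); // numerator = exp(x - max(x))
        sum += e[i];                   // denominator = sum(exp(x - max(x)))
    }

    for(std::size_t i = 0; i < x.size(); ++i)
        y[i] = alpha * e[i] / sum + beta * y[i];

    for(float v : y)
        std::printf("%f\n", v);
    return 0;
}
```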
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceSparseEmbedding3ForwardLayernorm : public device::BaseOperator +{ + struct Argument : public device::BaseArgument + { + Argument(Tensor& output, + const Tensor& emb_a, + const Tensor& emb_b, + const Tensor& emb_c, + const Tensor& index_a, + const Tensor& index_b, + const Tensor& index_c, + const Tensor& gamma, + const Tensor& beta, + ck::index_t NumRows, + ck::index_t EmbeddingDim, + ck::index_t IndexLength, + AccDataType epsilon) + : output_(output), + emb_a_(emb_a), + emb_b_(emb_b), + emb_c_(emb_c), + index_a_(index_a), + index_b_(index_b), + index_c_(index_c), + gamma_(gamma), + beta_(beta), + NumRows_(NumRows), + EmbeddingDim_(EmbeddingDim), + IndexLength_(IndexLength), + epsilon_(epsilon) + { + } + Tensor& output_; + const Tensor emb_a_; + const Tensor emb_b_; + const Tensor emb_c_; + const Tensor index_a_; + const Tensor index_b_; + const Tensor index_c_; + const Tensor gamma_; + const Tensor beta_; + ck::index_t NumRows_; + ck::index_t EmbeddingDim_; + ck::index_t IndexLength_; + AccDataType epsilon_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg) + { + ck::index_t D = arg.EmbeddingDim_; + ck::index_t L = arg.IndexLength_; + ck::index_t E = arg.NumRows_; + + Tensor accumulator({L, D}); + + Tensor mean({L}); + Tensor var({L}); + + accumulator.SetZero(); + + auto f_emb_per_row = [&](auto idx) { + IndexType idx_a = arg.index_a_(idx); + IndexType idx_b = arg.index_b_(idx); + IndexType idx_c = arg.index_c_(idx); + + if(!((idx_a < E) && (idx_b < E) && (idx_c < E))) + { + throw(std::runtime_error("wrong! 
out of range")); + } + + for(auto d = 0; d < D; d++) + { + auto v_a = ck::type_convert(arg.emb_a_(idx_a, d)); + auto v_b = ck::type_convert(arg.emb_b_(idx_b, d)); + auto v_c = ck::type_convert(arg.emb_c_(idx_c, d)); + + accumulator(idx, d) += v_a + v_b + v_c; + } + }; + make_ParallelTensorFunctor(f_emb_per_row, L)(std::thread::hardware_concurrency()); + + // layernorm + for(auto idx = 0; idx < L; ++idx) + { + mean(idx) = 0; + var(idx) = 0; + + for(auto d = 0; d < D; ++d) + { + auto x_val = accumulator(idx, d); + mean(idx) += x_val; + var(idx) += x_val * x_val; + } + + mean(idx) = mean(idx) / D; + var(idx) = (var(idx) / D) - (mean(idx) * mean(idx)); + } + + for(auto idx = 0; idx < L; ++idx) + { + for(auto d = 0; d < D; ++d) + { + auto x_val = accumulator(idx, d); + auto y_val = (x_val - mean(idx)) / sqrt(var(idx) + arg.epsilon_); + y_val = (y_val * arg.gamma_(d)) + arg.beta_(d); + arg.output_(idx, d) = ck::type_convert(y_val); + } + } + return 0; + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /* stream_config */ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + // TODO: properly implement this check + return true; + } + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(Tensor& output, + const Tensor& emb_a, + const Tensor& emb_b, + const Tensor& emb_c, + const Tensor& index_a, + const Tensor& index_b, + const Tensor& index_c, + const Tensor& gamma, + const Tensor& beta, + ck::index_t NumRows, + ck::index_t EmbeddingDim, + ck::index_t IndexLength, + AccDataType epsilon) + { + return Argument(output, + emb_a, + emb_b, + emb_c, + index_a, + index_b, + index_c, + gamma, + beta, + NumRows, + EmbeddingDim, + IndexLength, + epsilon); + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceSparseEmbedding3ForwardLayernorm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp new file mode 100644 index 00000000..df4fca65 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/gpu/naive_conv_fwd.hpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef NAIVE_CONV_FWD_HPP +#define NAIVE_CONV_FWD_HPP + +namespace ck { +namespace ref { + +/* + * \brief naive implementation of 3D convolution. Layout is (NDHWC, KZYXC, NDHWK). 
+ * + * \param N number of batches + * \param K number of filters + * \param C number of channels of weight + * \param (Di, Hi, Wi) depth, height and width dimension of data + * \param (Z, Y, X) depth, height and width dimensions of weights + * \param (Do, Ho, Wo) depth, height and width dimension of output + * \param (stride_z, stride_y, stride_x) strides + * \param (dilation_z, dilation_y, dilation_x) dilations + * \param (pad_z, pad_y, pad_x) pads + */ +template +__global__ void naive_conv_fwd_ndhwc_kzyxc_ndhwk(const TIn* __restrict__ p_in, + const TWei* __restrict__ p_wei, + TOut* __restrict__ p_out, + index_t N, + index_t K, + index_t C, + index_t Di, + index_t Hi, + index_t Wi, + index_t Z, + index_t Y, + index_t X, + index_t Do, + index_t Ho, + index_t Wo, + index_t stride_z, + index_t stride_y, + index_t stride_x, + index_t dilation_z, + index_t dilation_y, + index_t dilation_x, + index_t pad_z, + index_t pad_y, + index_t pad_x) +{ + const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const index_t num_threads = blockDim.x * gridDim.x; + const long_index_t output_length = N * Do * Ho * Wo * K; + + const index_t out_strides[] = {Do * Ho * Wo * K, Ho * Wo * K, Wo * K, K}; + const index_t in_strides[] = {Di * Hi * Wi * C, Hi * Wi * C, Wi * C, C}; + const index_t wei_strides[] = {Z * Y * X * C, Y * X * C, X * C, C}; + + constexpr auto in_op = InElementwiseOperation{}; + constexpr auto wei_op = WeiElementwiseOperation{}; + constexpr auto out_op = OutElementwiseOperation{}; + + TIn in_val; + TWei wei_val; + TOut out_val; + + for(long_index_t ii = tid; ii < output_length; ii += num_threads) + { + const index_t n = ii / out_strides[0]; + index_t k = ii - n * out_strides[0]; + const index_t dO = k / out_strides[1]; + k -= dO * out_strides[1]; + const index_t ho = k / out_strides[2]; + k -= ho * out_strides[2]; + const index_t wo = k / out_strides[3]; + k -= wo * out_strides[3]; + + TAcc acc = static_cast(0); + + const TIn* in_n = p_in + static_cast(n) * in_strides[0]; + const TWei* wei_k = p_wei + static_cast(k) * wei_strides[0]; + + for(index_t z = 0; z < Z; ++z) + { + index_t di = stride_z * dO - pad_z + dilation_z * z; + const TIn* in_n_di = in_n + di * in_strides[1]; + const TWei* wei_k_z = wei_k + z * wei_strides[1]; + + for(index_t y = 0; y < Y; ++y) + { + index_t hi = stride_y * ho - pad_y + dilation_y * y; + const TIn* in_n_di_hi = in_n_di + hi * in_strides[2]; + const TWei* wei_k_z_y = wei_k_z + y * wei_strides[2]; + + for(index_t x = 0; x < X; ++x) + { + index_t wi = stride_x * wo - pad_x + dilation_x * x; + const TIn* in_n_di_hi_wi = in_n_di_hi + wi * in_strides[3]; + const TWei* wei_k_z_y_x = wei_k_z_y + x * wei_strides[3]; + + if(di >= 0 && di < Di && hi >= 0 && hi < Hi && wi >= 0 && wi < Wi) + { + for(index_t c = 0; c < C; ++c) + { + in_op(in_val, in_n_di_hi_wi[c]); + wei_op(wei_val, wei_k_z_y_x[c]); + acc += in_val * wei_val; + } + } + } + } + } + + out_op(out_val, static_cast(acc)); + p_out[ii] = out_val; + } +} +} // namespace ref +} // namespace ck + +#endif diff --git a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp new file mode 100644 index 00000000..20df1b36 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
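
The naive 3D convolution kernel above trusts whatever Do/Ho/Wo the caller passes; it does not derive them. For completeness, the conventional output-size relation (an assumption based on standard convolution arithmetic, not stated in the kernel itself) is out = (in + 2*pad - dilation*(filter-1) - 1)/stride + 1, as in this small host-side sketch:

```cpp
#include <cstdio>

// Conventional convolution output-size formula (an assumption; the naive kernel
// above simply uses the Do/Ho/Wo values supplied by the caller).
int conv_out_size(int in, int filter, int pad, int stride, int dilation)
{
    return (in + 2 * pad - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    // Example: Di=Hi=Wi=32, Z=Y=X=3, pad=1, stride=1, dilation=1 -> 32x32x32 output
    std::printf("Do=%d Ho=%d Wo=%d\n",
                conv_out_size(32, 3, 1, 1, 1),
                conv_out_size(32, 3, 1, 1, 1),
                conv_out_size(32, 3, 1, 1, 1));
    return 0;
}
```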
+ +#pragma once + +#include +#include + +#include "ck/utility/functional2.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +void add_device_operation_instances(std::vector>& op_instances, + const NewOpInstances& new_op_instances) +{ + ck::static_for<0, std::tuple_size_v, 1>{}([&](auto i) { + const auto new_op_instance = std::get(new_op_instances); + + using NewOpInstance = remove_cvref_t; + + static_assert(std::is_base_of_v, + "wrong! NewOpInstance should be derived from BaseOp"); + + op_instances.push_back(std::make_unique(new_op_instance)); + }); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp new file mode 100644 index 00000000..91980a9a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/tuple.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// aliasing, for commonly used data type +using F64 = double; +using F32 = float; +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; +using I8 = int8_t; +using I32 = int32_t; + +using Empty_Tuple = ck::Tuple<>; + +using F16_Tuple = ck::Tuple; +using F16_F16_Tuple = ck::Tuple; + +using F32_Tuple = ck::Tuple; +using I32_Tuple = ck::Tuple; +using I32_F32_Tuple = ck::Tuple; + +// GEMM layout +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using Row_Tuple = ck::Tuple; +using Row_Row_Tuple = ck::Tuple; + +// Conv layout +// +using NWC = ck::tensor_layout::convolution::NWC; +using NHWC = ck::tensor_layout::convolution::NHWC; +using NDHWC = ck::tensor_layout::convolution::NDHWC; + +using KXC = ck::tensor_layout::convolution::KXC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using KZYXC = ck::tensor_layout::convolution::KZYXC; + +using NWK = ck::tensor_layout::convolution::NWK; +using NHWK = ck::tensor_layout::convolution::NHWK; +using NDHWK = ck::tensor_layout::convolution::NDHWK; + +// +using GNWC = ck::tensor_layout::convolution::GNWC; +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; + +using GKXC = ck::tensor_layout::convolution::GKXC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; + +using GNWK = ck::tensor_layout::convolution::GNWK; +using GNHWK = ck::tensor_layout::convolution::GNHWK; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +// +using NWGC = ck::tensor_layout::convolution::NWGC; +using NHWGC = ck::tensor_layout::convolution::NHWGC; +using NDHWGC = ck::tensor_layout::convolution::NDHWGC; + +using KXGC = ck::tensor_layout::convolution::KXGC; +using KYXGC = ck::tensor_layout::convolution::KYXGC; +using KZYXGC = ck::tensor_layout::convolution::KZYXGC; + +using NWGK = ck::tensor_layout::convolution::NWGK; +using NHWGK = ck::tensor_layout::convolution::NHWGK; +using NDHWGK = 
ck::tensor_layout::convolution::NDHWGK; + +// +using GK = ck::tensor_layout::convolution::G_K; +using GK_Tuple = ck::Tuple; +using GK_GK_Tuple = ck::Tuple; + +// pointwise functor +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Relu = ck::tensor_operation::element_wise::Relu; +using Scale = ck::tensor_operation::element_wise::Scale; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; +using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; +using FastGelu = ck::tensor_operation::element_wise::FastGelu; + +template +using Activation_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; + +template +using Add_Activation_Mul_Clamp = + ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; + +template +using Activation_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; + +template +using Add_Activation_Mul2_Clamp = + ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; + +template +struct DeviceOperationInstanceFactory; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp new file mode 100644 index 00000000..0655fd92 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances( + std::vector>>& + instances); + +void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances( + std::vector>>& instances); + +void 
add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances( + std::vector>>& instances); + +void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceBatchedGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp new file mode 100644 index 00000000..495c5f88 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
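
A minimal sketch of how a client might enumerate the batched-GEMM instances through the factory above. The full template parameter list of DeviceBatchedGemm is not visible in this rendering, so the alias below is an assumption modelled on the fp16 row-major case; GetInstances() and GetTypeString() come from the header itself.

```cpp
// Sketch only: assumes the usual CK headers and an fp16 row/row/row DeviceBatchedGemm
// specialization; the exact template argument order is an assumption.
#include <iostream>
#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp"

using F16         = ck::half_t;
using Row         = ck::tensor_layout::gemm::RowMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

int main()
{
    using DeviceOp = ck::tensor_operation::device::
        DeviceBatchedGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    // Each returned pointer is a concrete tuned kernel; print its description.
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << std::endl;

    return 0;
}
```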
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_multiple_d_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu; +using CDE1ElementOp = ck::tensor_operation::element_wise::Add; + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector, + Row, + ck::Tuple, + Row, + F16, + F16, + ck::Tuple, + F16, + ck::Tuple, + F16, + PassThrough, + PassThrough, + CDE0ElementOp, + PassThrough, + CDE1ElementOp>>>& + instances); + +void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance( + std::vector, + Col, + ck::Tuple, + Row, + F16, + F16, + ck::Tuple, + F16, + ck::Tuple, + F16, + PassThrough, + PassThrough, + CDE0ElementOp, + PassThrough, + CDE1ElementOp>>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD> +{ + using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance( + op_ptrs); + } + } + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp new file mode 100644 index 00000000..a6dcfa30 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
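
The fused instances declared above chain two GEMMs with an AddRelu epilogue between them and an Add at the end, i.e. E1 = ReLU(A·B0 + D0)·B1 + D1. A tiny single-batch scalar reference of that math (an assumption about the intended semantics, written independently of the CK types):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Scalar reference for gemm -> add+relu -> gemm -> add (single batch, row-major).
// Shapes (assumed): A[M,K], B0[K,N], D0[M,N], B1[N,O], D1[M,O], E1[M,O].
int main()
{
    const int M = 2, K = 3, N = 2, O = 2;
    std::vector<float> A(M * K, 1.0f), B0(K * N, 0.5f), D0(M * N, -1.0f);
    std::vector<float> B1(N * O, 2.0f), D1(M * O, 0.25f), E1(M * O, 0.0f);

    std::vector<float> C0(M * N, 0.0f); // intermediate = ReLU(A*B0 + D0)
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.0f;
            for(int k = 0; k < K; ++k)
                acc += A[m * K + k] * B0[k * N + n];
            C0[m * N + n] = std::max(acc + D0[m * N + n], 0.0f); // AddRelu
        }

    for(int m = 0; m < M; ++m)
        for(int o = 0; o < O; ++o)
        {
            float acc = 0.0f;
            for(int n = 0; n < N; ++n)
                acc += C0[m * N + n] * B1[n * O + o];
            E1[m * O + o] = acc + D1[m * O + o]; // final Add
        }

    for(float v : E1)
        std::printf("%f\n", v); // 2.25 for every element with these fills
    return 0;
}
```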
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector>>& instances); + +void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance( + std::vector>>& instances); +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchedGemmGemm> +{ + using DeviceOp = DeviceBatchedGemmGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance( + op_ptrs); + } + } + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp new file mode 100644 index 00000000..8a0b1b1f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
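
The batched GEMM+GEMM instances above compute, per batch, a first GEMM whose result immediately feeds a second GEMM. The sketch below is a scalar reference for that chain; the reading of the "gmk_gnk_gno_gmo" suffix as A:(G,M,K), B0:(G,N,K), B1:(G,N,O), E:(G,M,O) is an assumption, since the layout template arguments are not visible in this rendering.

```cpp
#include <cstdio>
#include <vector>

// Scalar reference for batched GEMM+GEMM: for each batch g,
//   Acc0 = A[g](MxK) * B0[g]^T(KxN), then E[g] = Acc0 * B1[g](NxO) -> (MxO).
int main()
{
    const int G = 2, M = 2, K = 3, N = 2, O = 2;
    std::vector<float> A(G * M * K, 1.0f), B0(G * N * K, 0.5f), B1(G * N * O, 2.0f);
    std::vector<float> E(G * M * O, 0.0f);

    for(int g = 0; g < G; ++g)
        for(int m = 0; m < M; ++m)
            for(int o = 0; o < O; ++o)
            {
                float acc1 = 0.0f;
                for(int n = 0; n < N; ++n)
                {
                    float acc0 = 0.0f; // A[g,m,:] dot B0[g,n,:]
                    for(int k = 0; k < K; ++k)
                        acc0 += A[(g * M + m) * K + k] * B0[(g * N + n) * K + k];
                    acc1 += acc0 * B1[(g * N + n) * O + o];
                }
                E[(g * M + m) * O + o] = acc1;
            }

    std::printf("E[0] = %f\n", E[0]); // 1.5 * 2.0 * N = 6.0 with these fills
    return 0;
}
```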
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector>>& instances); + +void add_device_batched_gemm_masking_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm> +{ + using DeviceOp = DeviceBatchedGemmSoftmaxGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(MaskOutUpperTriangle) + { + add_device_batched_gemm_masking_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + op_ptrs); + } + else + { + add_device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + op_ptrs); + } + } + } + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp new file mode 100644 index 00000000..89df1a7a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
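
The factory above dispatches between a masked and an unmasked instance list based on MaskOutUpperTriangle. The sketch below illustrates the assumed semantics of that flag, namely a causal mask applied to the first GEMM's scores before the softmax (columns above the diagonal forced to negative infinity):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

// Scalar sketch of softmax(mask(S)) per row, where "MaskOutUpperTriangle" is read
// (as an assumption) as: entries with column index > row index are set to -inf.
int main()
{
    const int M = 4, N = 4;
    std::vector<float> S(M * N, 1.0f); // pretend these are the GEMM0 scores
    std::vector<float> P(M * N, 0.0f);

    for(int m = 0; m < M; ++m)
    {
        float row_max = -std::numeric_limits<float>::infinity();
        for(int n = 0; n < N; ++n)
        {
            if(n > m) S[m * N + n] = -std::numeric_limits<float>::infinity(); // mask
            row_max = std::max(row_max, S[m * N + n]);
        }
        float sum = 0.0f;
        for(int n = 0; n < N; ++n)
        {
            P[m * N + n] = std::exp(S[m * N + n] - row_max); // exp(-inf) == 0
            sum += P[m * N + n];
        }
        for(int n = 0; n < N; ++n)
            P[m * N + n] /= sum;
    }

    std::printf("P[0][0]=%f P[0][1]=%f\n", P[0], P[1]); // first row: 1.0 then 0.0
    return 0;
}
```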
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances( + std::vector, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>>& + instances); + +void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances( + std::vector< + std::unique_ptr, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskDisabled>>>& + instances); + +void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances( + std::vector, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>>& + instances); + +void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances( + std::vector< + std::unique_ptr, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskDisabled>>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpec>> +{ + using DeviceOp = DeviceBatchedGemmSoftmaxGemmPermute<2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpec>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances( + op_ptrs); + } + else if(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances( + op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle) + { + add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances( + op_ptrs); + } + else if(MaskingSpec == MaskingSpecialization::MaskDisabled) + { + add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances( + op_ptrs); + } + } + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp new file mode 100644 index 00000000..c84ffcff --- /dev/null +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_backward.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_batchnorm_backward_rank_4_3_f16_instances( + std::vector>>&); + +// FP32 +void add_device_batchnorm_backward_rank_4_3_f32_instances( + std::vector>>&); + +// BF16 +void add_device_batchnorm_backward_rank_4_3_bf16_instances( + std::vector>>&); + +// FP64 +void add_device_batchnorm_backward_rank_4_3_f64_instances( + std::vector>>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormBwd> +{ + using DeviceOp = DeviceBatchNormBwd; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_backward_rank_4_3_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_backward_rank_4_3_f32_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_backward_rank_4_3_bf16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_backward_rank_4_3_f64_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp new file mode 100644 index 00000000..8e40d60c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
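
The backward-batchnorm factory above only registers the rank-4, 3-reduce-dim case, i.e. an NHWC tensor reduced over N, H and W per channel. For orientation, the textbook per-channel backward formulas (a generic reference, not the CK device implementation) look like this, with the reduced axis flattened to length R = N*H*W:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Textbook batchnorm backward for one channel using batch statistics.
int main()
{
    const int R = 4;
    std::vector<double> x{1.0, 2.0, 3.0, 4.0}, dy{0.1, -0.2, 0.3, -0.4};
    const double gamma = 1.5, eps = 1e-5;

    double mean = 0.0, var = 0.0;
    for(double v : x) mean += v;
    mean /= R;
    for(double v : x) var += (v - mean) * (v - mean);
    var /= R;
    const double invstd = 1.0 / std::sqrt(var + eps);

    double dgamma = 0.0, dbeta = 0.0;
    std::vector<double> xhat(R);
    for(int i = 0; i < R; ++i)
    {
        xhat[i] = (x[i] - mean) * invstd;
        dgamma += dy[i] * xhat[i]; // reduce dy * normalized x
        dbeta  += dy[i];           // reduce dy
    }

    // dx = gamma * invstd / R * (R*dy - dbeta - xhat*dgamma)
    for(int i = 0; i < R; ++i)
    {
        const double dx = gamma * invstd / R * (R * dy[i] - dbeta - xhat[i] * dgamma);
        std::printf("dx[%d] = %f\n", i, dx);
    }
    std::printf("dgamma = %f, dbeta = %f\n", dgamma, dbeta);
    return 0;
}
```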
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_batchnorm_forward.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_batchnorm_forward_rank_4_3_f16_instances( + std::vector< + std::unique_ptr>>&); + +// FP32 +void add_device_batchnorm_forward_rank_4_3_f32_instances( + std::vector< + std::unique_ptr>>&); + +// BF16 +void add_device_batchnorm_forward_rank_4_3_bf16_instances( + std::vector< + std::unique_ptr>>&); + +// FP64 +void add_device_batchnorm_forward_rank_4_3_f64_instances( + std::vector< + std::unique_ptr>>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceBatchNormFwd> +{ + using DeviceOp = DeviceBatchNormFwd; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_f32_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_bf16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) + { + add_device_batchnorm_forward_rank_4_3_f64_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp new file mode 100644 index 00000000..a0cea7e3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_bilinear.hpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
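
As with the backward case, the forward-batchnorm factory above registers only the rank-4, 3-reduce-dim (NHWC, per-channel) instances. A per-channel forward reference is sketched below; the running-average update with a momentum factor is an assumption about typical usage, not something read off the factory header.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Per-channel batchnorm forward reference; the reduced N*H*W axis is flattened to R.
int main()
{
    const int R = 4;
    std::vector<double> x{1.0, 2.0, 3.0, 4.0};
    const double gamma = 1.5, beta = 0.5, eps = 1e-5, momentum = 0.1;
    double running_mean = 0.0, running_var = 1.0;

    double mean = 0.0, var = 0.0;
    for(double v : x) mean += v;
    mean /= R;
    for(double v : x) var += (v - mean) * (v - mean);
    var /= R;

    for(int i = 0; i < R; ++i)
    {
        const double y = gamma * (x[i] - mean) / std::sqrt(var + eps) + beta;
        std::printf("y[%d] = %f\n", i, y);
    }

    // Assumed momentum-style update of the running statistics.
    running_mean = (1.0 - momentum) * running_mean + momentum * mean;
    running_var  = (1.0 - momentum) * running_var + momentum * var;
    std::printf("running_mean = %f, running_var = %f\n", running_mean, running_var);
    return 0;
}
```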
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + std::vector>>& instances); + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + std::vector>>& instances); + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + std::vector>>& instances); + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + std::vector>>& instances); + +// Contraction + Bilinear +template +struct DeviceOperationInstanceFactory, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>> +{ + using DeviceOp = DeviceContractionMultipleD, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) + { + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + op_ptrs); + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + op_ptrs); + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + op_ptrs); + add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp new file mode 100644 index 00000000..e921ecd4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/contraction_scale.hpp @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
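
The bilinear-contraction instances above cover the 2 M-dim, 2 N-dim, 2 K-dim case with the epilogue E = alpha*C + beta*D. After flattening (m0,m1)->m, (n0,n1)->n, (k0,k1)->k such a contraction reduces to an ordinary GEMM plus the bilinear combine, as in this small sketch (an illustrative reference, not the device kernel):

```cpp
#include <cstdio>
#include <vector>

// 2+2+2-dim contraction with Bilinear epilogue, written as a flattened GEMM.
int main()
{
    const int M0 = 2, M1 = 2, N0 = 2, N1 = 2, K0 = 2, K1 = 2;
    const int M = M0 * M1, N = N0 * N1, K = K0 * K1;
    const float alpha = 1.0f, beta = 0.5f;

    std::vector<float> A(M * K, 1.0f), B(K * N, 0.25f), D(M * N, 2.0f), E(M * N);

    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float c = 0.0f;
            for(int k = 0; k < K; ++k)
                c += A[m * K + k] * B[k * N + n];
            E[m * N + n] = alpha * c + beta * D[m * N + n]; // Bilinear: e = alpha*c + beta*d
        }

    std::printf("E[0] = %f\n", E[0]); // 4*0.25 + 0.5*2 = 2.0
    return 0;
}
```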
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_contraction_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + std::vector>>& instances); + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + std::vector>>& instances); + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + std::vector>>& instances); + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + std::vector>>& instances); + +// Contraction + Scale +template +struct DeviceOperationInstanceFactory, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>> +{ + using DeviceOp = DeviceContractionMultipleD, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Scale>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(NumDimM == 2 && NumDimN == 2 && NumDimK == 2) + { + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + op_ptrs); + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + op_ptrs); + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + op_ptrs); + add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp new file mode 100644 index 00000000..ec5d18fc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
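
The scale-contraction instances above differ from the bilinear ones only in the epilogue: there is no D tensor and the output is simply e = alpha*c. A functor-style sketch of those assumed semantics (mirroring, but not copied from, ck::tensor_operation::element_wise::Scale):

```cpp
#include <cstdio>

// Assumed semantics of the Scale epilogue: e = alpha * c, applied element-wise
// to the contraction result.
struct ScaleSketch
{
    float alpha_;
    void operator()(float& e, const float& c) const { e = alpha_ * c; }
};

int main()
{
    ScaleSketch scale{0.5f};
    float e = 0.0f;
    scale(e, 8.0f);         // e = 0.5 * 8.0
    std::printf("%f\n", e); // 4.0
    return 0;
}
```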
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// conv1d backward data +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances( + std::vector>>& instances); + +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances( + std::vector>>& + instances); + +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances( + std::vector>>& + instances); + +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances( + std::vector>>& instances); + +// conv2d backward data +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector>>& instances); + +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& instances); + +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector>>& instances); + +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( + std::vector>>& instances); + +// conv2d dl +void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& instances); + +void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances( + std::vector>>& instances); + +void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances( + std::vector>>& instances); +// conv3d backward data +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( + std::vector>>& instances); + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances( + std::vector>>& instances); + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances( + std::vector>>& instances); + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceConvBwdData; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 1 && is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs); + add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs); + add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs); 
+ add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp new file mode 100644 index 00000000..62f28c9b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/convolution_forward.hpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// conv2d forward +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& + instances); + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector>>& instances); + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& + instances); + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector>>& + instances); + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceConvFwd> +{ + using DeviceOp = DeviceConvFwd; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(op_ptrs); + add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp new file mode 100644 index 00000000..141af558 --- /dev/null +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/device_elementwise_instance.hpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Normalize = ck::tensor_operation::element_wise::Normalize; +using DeviceNormalizeFromMeanMeanSquarePtr = ck::tensor_operation::device::DeviceElementwiseBasePtr< + Tuple, + Tuple, + Normalize, + 2>; + +void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( + std::vector& instances); + +template +auto get_device_normalize_from_mean_meansquare_instances() +{ + std::vector op_ptrs; + + if constexpr(is_same::value && is_same::value && + is_same::value && is_same::value && + is_same::value && is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs); + } + + return op_ptrs; +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp new file mode 100644 index 00000000..682f5467 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/device_gemm_mean_squaremean_instance.hpp @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
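
The Normalize elementwise instances above turn a tensor plus precomputed per-row mean and mean-square into a layernorm-style output. The key identity is var = E[x^2] - (E[x])^2, after which the usual gamma/beta normalization applies; a minimal numeric sketch (the eps handling is an assumption):

```cpp
#include <cmath>
#include <cstdio>

// Normalize-from-(mean, meansquare): recover variance as meansquare - mean^2,
// then y = gamma * (x - mean) / sqrt(var + eps) + beta.
int main()
{
    const double x = 3.0, mean = 2.5, meansquare = 7.25; // E[x^2] for the row
    const double gamma = 1.0, beta = 0.0, eps = 1e-5;

    const double var = meansquare - mean * mean;        // 7.25 - 6.25 = 1.0
    const double y   = gamma * (x - mean) / std::sqrt(var + eps) + beta;

    std::printf("var = %f, y = %f\n", var, y); // roughly 1.0 and 0.5
    return 0;
}
```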
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using DeviceGemmAddAddMeanSquareMeanPtr = ck::tensor_operation::device::DeviceGemmReducePtr<1, 2>; + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( + std::vector&); +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( + std::vector&); +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( + std::vector&); +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( + std::vector&); + +template +auto get_device_gemm_add_add_mean_squaremean_instances() +{ + std::vector op_ptrs; + + if constexpr(is_same::value && is_same::value && + is_same::value) + { + if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( + op_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( + op_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( + op_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( + op_ptrs); + } + } + + return op_ptrs; +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp new file mode 100644 index 00000000..c87ae159 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
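
The DeviceGemmReducePtr<1, 2> alias above reflects one GEMM output plus two reductions: a per-row mean and a per-row mean of squares, which are exactly the statistics the normalize-from-mean/mean-square step consumes. A small sketch of those two reductions for one output row (illustrative values only):

```cpp
#include <cstdio>
#include <vector>

// Per-row reductions assumed to accompany the GEMM + bias + add output:
//   mean = (1/N) * sum(e),  meansquare = (1/N) * sum(e*e).
int main()
{
    std::vector<double> row{1.0, 2.0, 3.0, 4.0}; // one row of the epilogue output
    const int N = static_cast<int>(row.size());

    double mean = 0.0, meansquare = 0.0;
    for(double e : row)
    {
        mean += e;
        meansquare += e * e;
    }
    mean /= N;
    meansquare /= N;

    std::printf("mean = %f, meansquare = %f\n", mean, meansquare); // 2.5, 7.5
    return 0;
}
```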
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_elementwise_normalization_rank_2_1_f16_instances( + std::vector, + F16, + F16, + F32, + F16, + element_wise::Add, + PassThrough, + 2, + 1>>>&); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceElementwiseNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_elementwise_normalization_rank_2_1_f16_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp new file mode 100644 index 00000000..e230507e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -0,0 +1,385 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + + instances); + +void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( + std::vector>>& + instances); + +void 
add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( + std::vector>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGemm> +{ + using DeviceOp = DeviceGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(op_ptrs); + 
add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs); + add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs); + add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs); + add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp new file mode 100644 index 00000000..09d8e8b9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
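Before the fused-epilogue headers, here is a hedged usage sketch for the plain GEMM factory in `gemm.hpp` above. Everything outside the factory call is an illustrative assumption rather than part of this patch: the fp16 row-major x column-major problem shape, the `SimpleDeviceMem` helper, and the `MakeArgumentPointer`/`MakeInvokerPointer`/`IsSupportedArgument` calls, which follow the pattern used by the library's client examples.

```cpp
// Hedged sketch: enumerate the registered DeviceGemm instances, run every one that
// supports the problem, and report its timing. Problem sizes and the SimpleDeviceMem
// helper are assumptions for illustration only.
#include <cstddef>
#include <iostream>
#include <vector>

#include <hip/hip_runtime.h>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"

struct SimpleDeviceMem
{
    explicit SimpleDeviceMem(std::size_t size) { (void)hipMalloc(&p_, size); }
    ~SimpleDeviceMem() { (void)hipFree(p_); }
    void* p_ = nullptr;
};

int main()
{
    using F16         = ck::half_t;
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using Col         = ck::tensor_layout::gemm::ColumnMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    // A: M x K row-major, B: K x N column-major, C: M x N row-major (the "mk_nk_mn" branch)
    using DeviceOp = ck::tensor_operation::device::
        DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>;

    const ck::index_t M = 3840, N = 4096, K = 4096;
    const ck::index_t StrideA = K, StrideB = K, StrideC = N;

    SimpleDeviceMem a(sizeof(F16) * M * K), b(sizeof(F16) * K * N), c(sizeof(F16) * M * N);

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(const auto& op : op_ptrs)
    {
        auto argument = op->MakeArgumentPointer(a.p_, b.p_, c.p_, M, N, K,
                                                StrideA, StrideB, StrideC,
                                                PassThrough{}, PassThrough{}, PassThrough{});
        auto invoker  = op->MakeInvokerPointer();

        if(op->IsSupportedArgument(argument.get()))
        {
            // time_kernel = true makes Run() return the averaged kernel time in ms
            const float ms = invoker->Run(argument.get(), StreamConfig{nullptr, true});
            std::cout << op->GetTypeString() << ": " << ms << " ms" << std::endl;
        }
    }
    return 0;
}
```

Timing every supported instance this way and keeping the fastest one is how the client examples typically pick a kernel; instances whose tuning parameters do not fit the problem report `IsSupportedArgument(...) == false` and are simply skipped.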
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances( + std::vector>>&); + +// GEMM + Add + Add + FastGelu +template +struct DeviceOperationInstanceFactory, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>> +{ + using DeviceOp = DeviceGemmMultipleD, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp new file mode 100644 index 00000000..554437f4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
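A shorter sketch for the GEMM + Add + Add + FastGelu factory that just closed: the factory key is the full `DeviceGemmMultipleD` signature, with the two auxiliary D tensors carried in a `ck::Tuple` and the fused epilogue expressed as the `AddAddFastGelu` element-wise operation. The fp16, row x column, two-row-major-D combination below is an illustrative assumption.

```cpp
// Hedged sketch: spell out the factory key for E = FastGelu(A * B + D0 + D1)
// and list whatever instances are registered for it.
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"

int main()
{
    using F16 = ck::half_t;
    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;
    namespace ew = ck::tensor_operation::element_wise;

    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
        Row, Col, ck::Tuple<Row, Row>, Row,          // A, B, (D0, D1), E layouts
        F16, F16, ck::Tuple<F16, F16>, F16,          // A, B, (D0, D1), E data types
        ew::PassThrough, ew::PassThrough, ew::AddAddFastGelu>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "registered instances: " << op_ptrs.size() << std::endl;
    for(const auto& op : op_ptrs)
        std::cout << "  " << op->GetTypeString() << std::endl;
    return 0;
}
```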
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances( + std::vector>>&); + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances( + std::vector>>&); + +// GEMM + Add + FastGelu +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGemmMultipleD, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + PassThrough, + PassThrough, + AddFastGelu>> +{ + using DeviceOp = DeviceGemmMultipleD, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + PassThrough, + PassThrough, + AddFastGelu>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp new file mode 100644 index 00000000..ef70504f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances( + std::vector>>& instances); + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances( + std::vector>>& instances); + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances( + std::vector>>& instances); + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances( + std::vector>>& instances); + +// GEMM + Bilinear +template +struct DeviceOperationInstanceFactory, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>> +{ + using DeviceOp = DeviceGemmMultipleD, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp new file mode 100644 index 00000000..fbc5df98 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + std::vector>>&); + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + std::vector>>&); + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( + std::vector>>&); + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( + std::vector>>&); + +// GEMM + FastGelu +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGemmMultipleD; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp new file mode 100644 index 00000000..8986a793 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGemmSplitK> +{ + using DeviceOp = DeviceGemmSplitK; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp new file mode 100644 index 00000000..81b2b4fc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
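For the split-K factory that just closed, instance enumeration looks exactly like the plain GEMM case, so the sketch below only lists the registered instances. Launching one of them goes through `DeviceGemmSplitK`'s own argument, which (by assumption here, it is not visible in this header) carries an additional split-K factor on top of the usual GEMM parameters.

```cpp
// Hedged sketch: list the f32 row x row split-K instances registered by the factory above.
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp"

int main()
{
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;

    using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK<
        Row, Row, Row, float, float, float, PassThrough, PassThrough, PassThrough>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    // Splitting K across several work-group batches helps when M*N is small relative
    // to K; the same instance is usually re-timed with a few split-K values to pick
    // the best one.
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << std::endl;
    return 0;
}
```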
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_data_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// conv2d backward data +void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD< + NumDimSpatial, + OutLayout, + WeiLayout, + Empty_Tuple, + InLayout, + OutDataType, + WeiDataType, + Empty_Tuple, + InDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough>> +{ + using DeviceOp = + DeviceGroupedConvBwdDataMultipleD; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp new file mode 100644 index 00000000..ef6920e5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// conv1d backward weight +void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances( + std::vector>>& instances); + +// conv2d backward weight +void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances( + std::vector>>& instances); + +// conv3d backward weight +void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGroupedConvBwdWeight; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 1 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances( + op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances( + op_ptrs); + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( + op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances( + op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp new file mode 100644 index 00000000..ee38b738 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// grouped conv1d forward, GNWC/GKXC/GNWK +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances( + std::vector>>& instances); + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances( + std::vector>>& instances); + +// grouped conv2d forward, GNHWC/GKYXC/GNHWK +void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances( + std::vector>>& instances); + +void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances( + std::vector>>& instances); +// grouped conv2d forward, NHWGC/GKYXC/NHWGK +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector>>& instances); + +// grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( + std::vector>>& instances); + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 1 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + 
add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs); + add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); + add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs); + add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + // no instance + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + // no instance + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + // no instance + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp new file mode 100644 index 00000000..c64598da --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
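For the grouped convolution forward factory above, the key type spells out the spatial rank, the three tensor layouts, and the element types; the `constexpr` ladder then pulls in the matching XDL (and, for 2D GNHWC, also DL) registrations. One detail worth flagging for review: the bf16 declaration under the conv2d comment and its call in the 2D bf16 branch are named `add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances`, which looks like a conv1d/conv2d naming slip. The sketch below assumes the 2D fp16 GNHWC/GKYXC/GNHWK combination.

```cpp
// Hedged sketch: list the grouped conv2d forward instances for fp16 GNHWC/GKYXC/GNHWK.
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp"

int main()
{
    using F16         = ck::half_t;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    namespace conv    = ck::tensor_layout::convolution;

    using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
        2,                              // NumDimSpatial
        conv::GNHWC,                    // input layout
        conv::GKYXC,                    // weight layout
        ck::Tuple<>,                    // no extra D tensors
        conv::GNHWK,                    // output layout
        F16, F16, ck::Tuple<>, F16,
        PassThrough, PassThrough, PassThrough>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << std::endl;
    return 0;
}
```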
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& instances); + +void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& instances); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceGroupedGemm; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs); + } + } + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp new file mode 100644 index 00000000..55c67b76 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/normalization.hpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_normalization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// FP16 +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>&); + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>&); + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>&); + +// FP32 +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>&); + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>&); + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>&); + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceNormalization; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_normalization_rank_2_1_f16_instances(op_ptrs); + } + else if constexpr(Rank == 4 && NumReduceDim == 3) + { + add_device_normalization_rank_4_3_f16_instances(op_ptrs); + } + else if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_f16_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(Rank == 2 && NumReduceDim == 1) + { + add_device_normalization_rank_2_1_f32_instances(op_ptrs); + } + else if constexpr(Rank == 4 && NumReduceDim == 3) + { + add_device_normalization_rank_4_3_f32_instances(op_ptrs); + } + else if constexpr(Rank == 5 && NumReduceDim == 3) + { + add_device_normalization_rank_5_3_f32_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp new file mode 100644 index 00000000..eda81a23 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perchannel_quantization.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
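The normalization factory that just closed dispatches first on the data-type combination and then on `(Rank, NumReduceDim)`; unmatched combinations fall through and return an empty vector. The stand-alone C++17 sketch below (no CK types, `fp16_tag` is just a placeholder) illustrates why this `if constexpr` + `is_same_v` idiom is used across all of these factories: non-matching branches are discarded at compile time, so a client binary only references the `add_*_instances` functions it can actually use.

```cpp
// Hedged illustration of the compile-time dispatch idiom, independent of CK.
#include <iostream>
#include <string>
#include <type_traits>
#include <vector>

struct fp16_tag  // stand-in for the real fp16 type in this sketch
{
};

template <typename DataType, int Rank, int NumReduceDim>
std::vector<std::string> get_instances()
{
    std::vector<std::string> names;

    if constexpr(std::is_same_v<DataType, fp16_tag>)
    {
        if constexpr(Rank == 2 && NumReduceDim == 1)
            names.push_back("normalization_rank_2_1_f16");
        else if constexpr(Rank == 4 && NumReduceDim == 3)
            names.push_back("normalization_rank_4_3_f16");
    }
    else if constexpr(std::is_same_v<DataType, float>)
    {
        if constexpr(Rank == 2 && NumReduceDim == 1)
            names.push_back("normalization_rank_2_1_f32");
    }
    // any other combination simply returns an empty vector

    return names;
}

int main()
{
    for(const auto& n : get_instances<float, 2, 1>())
        std::cout << n << std::endl;
    return 0;
}
```

Callers should therefore treat an empty result from `GetInstances()` as "no pre-built instance for this configuration" rather than as an error.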
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// grouped conv2d forward, GNHWC/GKYXC/GNHWK +void add_device_conv2d_bias_perchannel_quantization_int8_instances( + std::vector< + std::unique_ptr>>>& + instances); + +void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances( + std::vector>>>& + instances); + +template +struct DeviceOperationInstanceFactory>> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v) + add_device_conv2d_bias_perchannel_quantization_int8_instances(op_ptrs); + else if constexpr(is_same_v) + add_device_conv2d_bias_relu_perchannel_quantization_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp new file mode 100644 index 00000000..11384026 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_bias_forward_perlayer_quantization.hpp @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// grouped conv2d forward, GNHWC/GKYXC/GNHWK +void add_device_conv2d_bias_perlayer_quantization_int8_instances( + std::vector< + std::unique_ptr>>>& + instances); + +void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances( + std::vector>>>& + instances); + +template +struct DeviceOperationInstanceFactory>> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v) + add_device_conv2d_bias_perlayer_quantization_int8_instances(op_ptrs); + else if constexpr(is_same_v) + add_device_conv2d_bias_relu_perlayer_quantization_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp new file mode 100644 index 00000000..1a67ce56 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perchannel_quantization.hpp @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// grouped conv2d forward, GNHWC/GKYXC/GNHWK +void add_device_conv2d_perchannel_quantization_int8_instances( + std::vector>>>& + instances); + +void add_device_conv2d_relu_perchannel_quantization_int8_instances( + std::vector>>>& + instances); + +template +struct DeviceOperationInstanceFactory>> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v) + add_device_conv2d_perchannel_quantization_int8_instances(op_ptrs); + else if constexpr(is_same_v) + add_device_conv2d_relu_perchannel_quantization_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp new file mode 100644 index 00000000..410be4a5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/quantization/grouped_convolution_forward_perlayer_quantization.hpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// grouped conv2d forward, GNHWC/GKYXC/GNHWK +void add_device_conv2d_perlayer_quantization_int8_instances( + std::vector>>>& + instances); + +void add_device_conv2d_relu_perlayer_quantization_int8_instances( + std::vector>>>& + instances); + +template +struct DeviceOperationInstanceFactory>> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleD>; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(NumDimSpatial == 2 && is_same_v && + is_same_v && is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v) + add_device_conv2d_perlayer_quantization_int8_instances(op_ptrs); + else if constexpr(is_same_v) + add_device_conv2d_relu_perlayer_quantization_int8_instances(op_ptrs); + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp new file mode 100644 index 00000000..550a7b03 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp" diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp new file mode 100644 index 00000000..90cfe837 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using reduce_configuration_1_instances_blockwise = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_blockwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 2, 2, 2, 1>, + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_blockwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 4, 8, 1>, + ReductionConfiguration_2<0, 4, 4, 4, 1>, + ReductionConfiguration_2<0, 2, 2, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +void add_device_reduce_instance_blockwise( + std::vector>& + device_op_instances) +{ + static_for<0, std::tuple_size::value, 1>{}( + [&](auto i) { + using cfg1 = remove_cvref_t(reduce_configuration_1_instances_blockwise{}))>; + + static_for<0, std::tuple_size::value, 1>{}( + [&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_blockwise{}))>; + + using ReduceOpInstance = + DeviceReduceMultiBlock; + + device_op_instances.push_back( + std::make_unique(ReduceOpInstance{})); + }); + }); +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp new file mode 100644 index 00000000..521d93e6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
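The blockwise reduction header that just closed builds its instances by expanding the Cartesian product of `reduce_configuration_1_instances_blockwise` and `reduce_configuration_2_instances_blockwise` with `static_for`, which is expensive to compile. The per-type headers that follow (starting with the bf16/f32/bf16 `add` header here) therefore only declare `extern template` specializations; the definitions are compiled once inside the instance library. The sketch below is generic C++ rather than CK code and shows just that split.

```cpp
// Hedged sketch of the extern-template pattern: header declares, library defines once,
// clients only link. The stub body stands in for the heavy static_for expansion.
#include <iostream>
#include <string>
#include <vector>

// --- header part: what a per-type instance header exposes -----------------------
template <typename InT, typename AccT, typename OutT, int Rank, int NumReduceDim>
void add_device_reduce_instance_blockwise_example(std::vector<std::string>& instances)
{
    // The real template expands ReductionConfiguration_1 x ReductionConfiguration_2;
    // this stub just records one name so the sketch stays short.
    instances.push_back("blockwise_reduce_rank_" + std::to_string(Rank) + "_" +
                        std::to_string(NumReduceDim));
}

// Declaration only: clients that include the header do not re-instantiate the template.
extern template void add_device_reduce_instance_blockwise_example<float, double, float, 4, 3>(
    std::vector<std::string>&);

// --- library part: compiled exactly once (normally in a .cpp of the instance library) ---
template void add_device_reduce_instance_blockwise_example<float, double, float, 4, 3>(
    std::vector<std::string>&);

// --- client part -----------------------------------------------------------------
int main()
{
    std::vector<std::string> instances;
    add_device_reduce_instance_blockwise_example<float, double, float, 4, 3>(instances);
    for(const auto& name : instances)
        std::cout << name << std::endl;
    return 0;
}
```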
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp new file mode 100644 index 00000000..fe3fd6c0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp new file mode 100644 index 00000000..52a2b69c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
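Each per-type header in this group, like the bf16 add one above, fixes one (InDataType, AccDataType, OutDataType, ReduceOperation) combination and declares its factory specializations with extern template; a matching source file in the instance library holds the single explicit instantiation definition. A minimal sketch of that declaration/definition split, using invented names (OpBase, add_instances) rather than the CK signatures:

    #include <memory>
    #include <vector>

    struct OpBase
    {
        virtual ~OpBase() = default;
    };

    // The generic factory template (analogue of device_reduce_instance_blockwise.hpp).
    template <typename InT, typename AccT, typename OutT>
    void add_instances(std::vector<std::unique_ptr<OpBase>>& out)
    {
        struct Op : OpBase
        {
        };
        out.push_back(std::make_unique<Op>()); // the heavy instantiation would happen here
    }

    // Per-type header analogue: promise that this specialization is instantiated
    // elsewhere, so translation units that include it do not re-instantiate it.
    extern template void add_instances<float, float, float>(std::vector<std::unique_ptr<OpBase>>&);

    // Per-type .cpp analogue: the one explicit instantiation definition that emits the code.
    template void add_instances<float, float, float>(std::vector<std::unique_ptr<OpBase>>&);

The point of the split is compile time: only the instance library pays for the heavy template instantiations, while code that includes these headers merely links against them.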
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp new file mode 100644 index 00000000..ee4fee41 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp new file mode 100644 index 00000000..3abdb7f9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp new file mode 100644 index 00000000..b0dbcf31 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp new file mode 100644 index 00000000..7bbf3df0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp new file mode 100644 index 00000000..559f3222 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp new file mode 100644 index 00000000..28c96107 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp new file mode 100644 index 00000000..5080d286 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp new file mode 100644 index 00000000..0d24d153 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
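The f16_f32_f16 and b16_f32_b16 headers in this group deliberately accumulate in a wider type than the element type, since summing many half- or bfloat16-precision values directly loses low-order bits almost immediately. A plain C++ analogue of the same effect (float accumulator versus double accumulator, nothing CK-specific):

    #include <cstdio>

    int main()
    {
        // Accumulate ten million copies of 0.1f with a narrow and a wide accumulator.
        float  acc_f = 0.0f;
        double acc_d = 0.0;
        for(int i = 0; i < 10000000; ++i)
        {
            acc_f += 0.1f;
            acc_d += 0.1f; // same inputs, wider accumulator
        }
        // acc_f drifts visibly away from the expected ~1,000,000;
        // acc_d stays within a small fraction of it.
        std::printf("float acc: %f, double acc: %f\n", acc_f, acc_d);
        return 0;
    }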
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp new file mode 100644 index 00000000..c806e807 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp new file mode 100644 index 00000000..b7c046e7 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp new file mode 100644 index 00000000..771bec1c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp new file mode 100644 index 00000000..c1fe8add --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp new file mode 100644 index 00000000..6bc0662f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp new file mode 100644 index 00000000..6f800513 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp new file mode 100644 index 00000000..c771ac4f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp new file mode 100644 index 00000000..b9ddbb9a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp new file mode 100644 index 00000000..390a719c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp new file mode 100644 index 00000000..2a9ddbc6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp new file mode 100644 index 00000000..57468844 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp new file mode 100644 index 00000000..ad0f2357 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp new file mode 100644 index 00000000..c7d95276 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp new file mode 100644 index 00000000..ec562299 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp new file mode 100644 index 00000000..48f66da6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp new file mode 100644 index 00000000..fabfa5b4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp new file mode 100644 index 00000000..e08faec2 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp new file mode 100644 index 00000000..a1e692aa --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp new file mode 100644 index 00000000..e9654e8c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp new file mode 100644 index 00000000..78244213 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp new file mode 100644 index 00000000..df323d40 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp new file mode 100644 index 00000000..8c08e5ef --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+
+#pragma once
+
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+template <int BlockSize, int MThreadClusterSize, int KThreadClusterSize>
+struct ReductionConfiguration_1
+{
+    static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, "Invalid Configuration!");
+
+    static constexpr int BlockSize_ = BlockSize;
+    static constexpr int MThreadClusterSize_ = MThreadClusterSize;
+    static constexpr int KThreadClusterSize_ = KThreadClusterSize;
+};
+
+template <int InSrcVectorDim,
+          int InSrcVectorSize,
+          int OutDstVectorSize,
+          int MThreadSliceSize,
+          int KThreadSliceSize>
+struct ReductionConfiguration_2
+{
+    static constexpr int InSrcVectorDim_ = InSrcVectorDim;
+    static constexpr int InSrcVectorSize_ = InSrcVectorSize;
+    static constexpr int OutDstVectorSize_ = OutDstVectorSize;
+    static constexpr int MThreadSliceSize_ = MThreadSliceSize;
+    static constexpr int KThreadSliceSize_ = KThreadSliceSize;
+};
+
+using ReduceAdd = ck::reduce::Add;
+using ReduceMin = ck::reduce::Min;
+using ReduceMax = ck::reduce::Max;
+using ReduceAMax = ck::reduce::AMax;
+
+using UnarySquare = ck::tensor_operation::element_wise::UnarySquare;
+using UnarySqrt = ck::tensor_operation::element_wise::UnarySqrt;
+using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide;
+using UnaryAbs = ck::tensor_operation::element_wise::UnaryAbs;
+
+#define QUICK_REDUCE_TEST 1
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
new file mode 100644
index 00000000..acf55d06
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
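Note that this common header defines QUICK_REDUCE_TEST and is included by the blockwise and multiblock factory headers before their #ifdef QUICK_REDUCE_TEST checks, so as the patch stands the reduced five-entry configuration_2 list is the one that gets compiled. A standalone check of the resulting instance count per factory call, mirroring the tuple sizes above:

    #include <tuple>

    // 8 cluster configurations and, with QUICK_REDUCE_TEST in effect,
    // 5 vector/slice configurations.
    template <int...>
    struct C1
    {
    };
    template <int...>
    struct C2
    {
    };

    using cfg1 = std::tuple<C1<256, 128, 2>, C1<256, 64, 4>, C1<256, 32, 8>, C1<256, 16, 16>,
                            C1<256, 8, 32>, C1<256, 4, 64>, C1<256, 2, 128>, C1<256, 1, 256>>;

    using cfg2_quick = std::tuple<C2<0, 2, 2, 2, 1>, C2<0, 1, 1, 2, 1>, C2<1, 2, 1, 1, 2>,
                                  C2<0, 1, 1, 3, 1>, C2<1, 1, 1, 1, 3>>;

    static_assert(std::tuple_size_v<cfg1> * std::tuple_size_v<cfg2_quick> == 40,
                  "each add_device_reduce_instance_* call emits 40 instances; "
                  "the full 14-entry list would emit 112");

    int main() { return 0; }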
+ +#pragma once + +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using reduce_configuration_1_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // BlockSize | MThreadClusterSize | KThreadClusterSize + ReductionConfiguration_1<256, 128, 2>, + ReductionConfiguration_1<256, 64, 4>, + ReductionConfiguration_1<256, 32, 8>, + ReductionConfiguration_1<256, 16, 16>, + ReductionConfiguration_1<256, 8, 32>, + ReductionConfiguration_1<256, 4, 64>, + ReductionConfiguration_1<256, 2, 128>, + ReductionConfiguration_1<256, 1, 256> + // clang-format on + >; + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 2, 2, 2, 1>, + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 4, 8, 1>, + ReductionConfiguration_2<0, 4, 4, 4, 1>, + ReductionConfiguration_2<0, 2, 2, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +void add_device_reduce_instance_multiblock_atomic_add( + std::vector>& + device_op_instances) +{ + static_for<0, + std::tuple_size::value, + 1>{}([&](auto i) { + using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; + + static_for<0, + std::tuple_size::value, + 1>{}([&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; + + using ReduceOpInstance = DeviceReduceMultiBlock; + + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + }); + }); +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp new file mode 100644 index 00000000..f5102f49 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
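The multiblock atomic-add factory differs from the blockwise one in how partial results are combined: several workgroups cooperate on the same output element and fold their partial sums directly into the destination buffer, which is presumably why only add and avg per-type headers accompany it (min, max, and norm2 cannot be finalized with a plain atomic add). A rough HIP-style illustration of the idea, not the patch's kernel:

    #include <cstddef>
    #include <hip/hip_runtime.h>

    // Illustration only. Grid layout: blockIdx.y selects the output element,
    // gridDim.x blocks split its K dimension between them. The output buffer
    // must be zero-initialized before launch.
    __global__ void partial_reduce_atomic(const float* in, float* out, int k_total)
    {
        const int m      = blockIdx.y;                            // output element
        const int kBegin = blockIdx.x * blockDim.x + threadIdx.x; // this block's K slice
        const int kStep  = gridDim.x * blockDim.x;

        float partial = 0.0f;
        for(int k = kBegin; k < k_total; k += kStep)
            partial += in[static_cast<std::size_t>(m) * k_total + k];

        // Several workgroups write to the same out[m], hence the atomic. A real
        // kernel would first combine `partial` across the block (e.g. in LDS)
        // and let a single thread issue the atomic; per-thread atomics keep the
        // sketch short.
        atomicAdd(&out[m], partial);
    }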
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp new file mode 100644 index 00000000..ec513113 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp new file mode 100644 index 00000000..3a3d53b8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp new file mode 100644 index 00000000..bbf43989 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp new file mode 100644 index 00000000..55147a60 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp new file mode 100644 index 00000000..4bff06c6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp new file mode 100644 index 00000000..daffa1aa --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp new file mode 100644 index 00000000..52c41711 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp new file mode 100644 index 00000000..2f358b06 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp new file mode 100644 index 00000000..84c99dcc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp new file mode 100644 index 00000000..dfcc8dd8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
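Only Add and Avg instance headers appear in this multiblock atomic-add group (bf16/f16/f32/f64 inputs with f32 or f64 accumulation), whereas the threadwise group that follows also registers Max, Min, AMax and Norm2. That split is consistent with how the multiblock path combines results: each workgroup reduces a disjoint slice of the reduced dimensions and folds its partial value into a pre-zeroed output with an atomic add, which is only well defined for an associative, commutative operation with a neutral element. A minimal CPU-side sketch of that merge step, using std::thread and std::atomic rather than anything from CK:

#include <algorithm>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <thread>
#include <vector>

// Each "block" reduces its own slice and merges the partial result atomically.
// The destination must start at the operation's neutral element (0 for add).
std::int64_t multiblock_atomic_sum(const std::vector<std::int64_t>& data, int num_blocks)
{
    std::atomic<std::int64_t> out{0};
    const std::size_t chunk = (data.size() + num_blocks - 1) / num_blocks;

    std::vector<std::thread> blocks;
    for (int b = 0; b < num_blocks; ++b)
    {
        blocks.emplace_back([&, b] {
            const std::size_t begin = std::min(data.size(), b * chunk);
            const std::size_t end   = std::min(data.size(), begin + chunk);
            const std::int64_t partial =
                std::accumulate(data.begin() + begin, data.begin() + end, std::int64_t{0});
            out.fetch_add(partial, std::memory_order_relaxed);
        });
    }
    for (std::thread& t : blocks)
        t.join();
    return out.load();
}

An Avg reduction can reuse the same merge if each partial sum is pre-scaled by 1/reduce_length before it is accumulated.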
+ +#pragma once + +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#ifdef QUICK_REDUCE_TEST +using reduce_configuration_2_instances_threadwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 2, 2, 2, 1>, + ReductionConfiguration_2<0, 1, 1, 2, 1>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<1, 1, 1, 1, 3> + // clang-format on + >; +#else +using reduce_configuration_2_instances_threadwise = std::tuple< + // clang-format off + // InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize + ReductionConfiguration_2<0, 4, 4, 8, 1>, + ReductionConfiguration_2<0, 4, 4, 4, 1>, + ReductionConfiguration_2<0, 2, 2, 2, 1>, + + ReductionConfiguration_2<1, 4, 1, 1, 8>, + ReductionConfiguration_2<1, 4, 1, 1, 4>, + ReductionConfiguration_2<1, 2, 1, 1, 2>, + + // special instances + ReductionConfiguration_2<0, 1, 1, 3, 1>, + ReductionConfiguration_2<0, 1, 1, 5, 1>, + ReductionConfiguration_2<0, 1, 1, 7, 1>, + ReductionConfiguration_2<0, 1, 1, 11, 1>, + + ReductionConfiguration_2<1, 1, 1, 1, 3>, + ReductionConfiguration_2<1, 1, 1, 1, 5>, + ReductionConfiguration_2<1, 1, 1, 1, 7>, + ReductionConfiguration_2<1, 1, 1, 1, 11> + // clang-format on + >; +#endif + +template +void add_device_reduce_instance_threadwise( + std::vector>& + device_op_instances) +{ + using cfg1 = ReductionConfiguration_1<256, 256, 1>; + + static_for<0, std::tuple_size::value, 1>{}( + [&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_threadwise{}))>; + + using ReduceOpInstance = DeviceReduceThreadWise; + + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); + }); +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp new file mode 100644 index 00000000..4168508b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
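The remaining headers in this reduce group all follow the same shape: each one declares, via extern template, a handful of add_device_reduce_instance_threadwise (or add_device_reduce_instance_multiblock_atomic_add) specializations for the data-type/operation combination encoded in its file name, with the column comment (InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex) documenting the template parameters each declaration carries. The point of the extern template declarations is that including a header does not re-instantiate these heavy device-op factories in every translation unit; the definitions are presumably compiled once in the matching instance .cpp files elsewhere in this patch. A small self-contained illustration of the mechanism, with hypothetical names:

// reduce_instance.hpp -- hypothetical header mirroring the pattern above
#include <vector>

template <typename InT, typename AccT, typename OutT>
OutT reduce_add(const std::vector<InT>& in)
{
    AccT acc{};
    for (const InT& v : in)
        acc += static_cast<AccT>(v);
    return static_cast<OutT>(acc);
}

// Explicit instantiation declaration: every includer reuses one shared
// instantiation instead of generating its own.
extern template float reduce_add<float, double, float>(const std::vector<float>&);

// reduce_instance.cpp -- the single explicit instantiation definition
// #include "reduce_instance.hpp"
template float reduce_add<float, double, float>(const std::vector<float>&);

Callers simply include the header; the object file holding the instantiation supplies the symbol at link time.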
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp new file mode 100644 index 00000000..317006e3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp new file mode 100644 index 00000000..fc7718dd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp new file mode 100644 index 00000000..e6616386 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp new file mode 100644 index 00000000..a9441b8e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp new file mode 100644 index 00000000..6820ace8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp new file mode 100644 index 00000000..ab3d4e6e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp new file mode 100644 index 00000000..ee08c963 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp new file mode 100644 index 00000000..1007ca27 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp new file mode 100644 index 00000000..1d562c49 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp new file mode 100644 index 00000000..5aac638b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp new file mode 100644 index 00000000..7a3c7640 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp new file mode 100644 index 00000000..4685d7b5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp new file mode 100644 index 00000000..1de338fb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp new file mode 100644 index 00000000..e86c41a9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp new file mode 100644 index 00000000..2ca90085 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp new file mode 100644 index 00000000..38380e71 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp new file mode 100644 index 00000000..04c5f3e6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp new file mode 100644 index 00000000..fef5d408 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp new file mode 100644 index 00000000..2416f614 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp new file mode 100644 index 00000000..fbd0285a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp new file mode 100644 index 00000000..103b85a0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp new file mode 100644 index 00000000..e01f590f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp new file mode 100644 index 00000000..14a7459b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp new file mode 100644 index 00000000..7dfd8060 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp new file mode 100644 index 00000000..7670a27c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp new file mode 100644 index 00000000..8bb85f37 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp new file mode 100644 index 00000000..a005ba8d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp new file mode 100644 index 00000000..9e8c07eb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp new file mode 100644 index 00000000..a69f88f5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp new file mode 100644 index 00000000..734b31c1 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp new file mode 100644 index 00000000..237bd969 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp new file mode 100644 index 00000000..36eb092f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_instances( + std::vector>&); +void add_device_softmax_f16_f16_rank4_instances( + std::vector>&); + +void add_device_softmax_f32_f32_rank3_instances( + std::vector>&); +void add_device_softmax_f32_f32_rank4_instances( + std::vector>&); + +void add_device_softmax_i8_i8_rank3_instances( + std::vector>&); +void add_device_softmax_i8_i8_rank4_instances( + std::vector>&); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device:: + DeviceSoftmax> +{ + using DeviceOp = + DeviceSoftmax; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + add_device_softmax_f16_f16_rank3_instances(op_ptrs); + else if constexpr(Rank == 4) + add_device_softmax_f16_f16_rank4_instances(op_ptrs); + } + else if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + add_device_softmax_f32_f32_rank3_instances(op_ptrs); + else if constexpr(Rank == 4) + add_device_softmax_f32_f32_rank4_instances(op_ptrs); + } + else if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + add_device_softmax_i8_i8_rank3_instances(op_ptrs); + else if constexpr(Rank == 4) + add_device_softmax_i8_i8_rank4_instances(op_ptrs); + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp new file mode 100644 index 00000000..83f52fc3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_instances( + std::vector>& instances); +void add_device_softmax_f16_f16_rank4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp new file mode 100644 index 00000000..046ff578 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
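The factory above dispatches on the data types and tensor rank at compile time and returns every matching softmax instance. A hedged usage sketch follows, modeled on the client examples added earlier in this patch; the DeviceSoftmax parameter order (InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank) and the MakeArgumentPointer signature are assumptions, since the angle-bracket arguments are collapsed in this text:

// Hedged sketch: run the first registered fp16 rank-3 softmax instance that supports the problem.
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"

using F16         = ck::half_t;
using F32         = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

bool run_softmax(const void* in_dev, void* out_dev)
{
    std::vector<ck::index_t> lengths{8, 128, 2048};        // rank-3 problem
    std::vector<ck::index_t> strides{128 * 2048, 2048, 1}; // packed, row-major
    std::vector<ck::index_t> reduce_dims{2};               // softmax over the last dimension

    using DeviceOp =
        ck::tensor_operation::device::DeviceSoftmax<F16, F32, F16, PassThrough, PassThrough, 3>;

    auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    for(auto& op_ptr : op_ptrs)
    {
        auto argument_ptr = op_ptr->MakeArgumentPointer(lengths,
                                                        strides,
                                                        reduce_dims,
                                                        1.0, // alpha
                                                        0.0, // beta
                                                        in_dev,
                                                        out_dev,
                                                        PassThrough{},
                                                        PassThrough{});

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            op_ptr->MakeInvokerPointer()->Run(argument_ptr.get(), StreamConfig{nullptr, false});
            return true;
        }
    }
    return false; // no registered instance supports this shape
}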
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp new file mode 100644 index 00000000..8e6a226f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp new file mode 100644 index 00000000..518fa5f9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp new file mode 100644 index 00000000..10016cdd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
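The rank-level entry points declared in device_softmax_f16_f16_instance.hpp presumably just chain the per-NumReduceDim registration functions declared in the headers above; the translation unit itself is outside this excerpt, so the body below is a hedged sketch and the DeviceSoftmaxPtr element type is an assumption (the real argument type is collapsed in this patch text):

// Hedged sketch of the aggregating instantiation file; not copied from this patch.
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_softmax_f16_f16_rank3_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
{
    // Collect every rank-3 fp16 instance regardless of how many dimensions are reduced.
    add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
    add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
    add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck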
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp new file mode 100644 index 00000000..cdd5a3cd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp new file mode 100644 index 00000000..a8be272e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp new file mode 100644 index 00000000..ec8296ff --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp new file mode 100644 index 00000000..b3877c4b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using device_softmax_f16_f16_instances = std::tuple< + // clang-format off + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + // fallback kernel + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>, + // Reduction on middle dimensions + // InSrcVectorDim is 0 since we want to coalesce reads on M dimension + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 1, 1>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 8, 4> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp new file mode 100644 index 00000000..a6d9a359 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
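device_softmax_f16_f16_instances above is a compile-time list of tuning configurations for a given Rank/NumReduceDim pair: one scalar-access fallback kernel, a sweep of vectorized block/cluster/slice shapes, and two middle-dimension entries with InSrcVectorDim = 0 so loads stay coalesced along M. Below is a hedged sketch of how such a tuple is typically materialized into the per-rank registration functions; add_device_operation_instances and the DeviceSoftmaxPtr element type are assumptions here, not taken from this excerpt:

// Hedged sketch: instantiate every tuple entry for Rank = 3, NumReduceDim = 1.
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp"

namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {

void add_device_softmax_f16_f16_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
{
    // Appends one object of each DeviceSoftmaxImpl type in the tuple to `instances`.
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
}

} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck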
+ +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_instances( + std::vector>& instances); +void add_device_softmax_f32_f32_rank4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp new file mode 100644 index 00000000..6621a2c8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp new file mode 100644 index 00000000..3dfac98e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp new file mode 100644 index 00000000..6d2a0c93 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp new file mode 100644 index 00000000..97dd3dcb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp new file mode 100644 index 00000000..58f8760a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp new file mode 100644 index 00000000..df8d31f0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp new file mode 100644 index 00000000..1bd77322 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp new file mode 100644 index 00000000..16f129d2 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using device_softmax_f32_f32_instances = std::tuple< + // clang-format off + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>, + // Reduction on middle dimensions + // InSrcVectorDim is 0 since we want to coalesce reads on M dimension + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 1, 1>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 4, 4> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp new file mode 100644 index 00000000..f80f712f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_instances( + std::vector>& instances); +void add_device_softmax_i8_i8_rank4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp new file mode 100644 index 00000000..6f9952e7 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp new file mode 100644 index 00000000..2cbd13a1 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp new file mode 100644 index 00000000..7b12522a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp new file mode 100644 index 00000000..54d477f8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp new file mode 100644 index 00000000..4ffc44e3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp new file mode 100644 index 00000000..08cbb812 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp new file mode 100644 index 00000000..187d034b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp new file mode 100644 index 00000000..7fc9ed69 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using device_softmax_i8_i8_instances = std::tuple< + // clang-format off + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + // fallback kernel + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>, + // Reduction on middle dimensions + // InSrcVectorDim is 0 since we want to coalesce reads on M dimension + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp new file mode 100644 index 00000000..03be6e2b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp" diff --git a/library/include/ck/library/utility/algorithm.hpp b/library/include/ck/library/utility/algorithm.hpp new file mode 100644 index 00000000..86f04dd3 --- /dev/null +++ b/library/include/ck/library/utility/algorithm.hpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +namespace ck { +namespace ranges { +template +auto copy(InputRange&& range, OutputIterator iter) + -> decltype(std::copy(std::begin(std::forward(range)), + std::end(std::forward(range)), + iter)) +{ + return std::copy(std::begin(std::forward(range)), + std::end(std::forward(range)), + iter); +} + +template +auto fill(OutputRange&& range, const T& init) + -> std::void_t(range)), + std::end(std::forward(range)), + init))> +{ + std::fill(std::begin(std::forward(range)), + std::end(std::forward(range)), + init); +} + +template +auto transform(InputRange&& range, OutputIterator iter, UnaryOperation unary_op) + -> decltype(std::transform(std::begin(range), std::end(range), iter, unary_op)) +{ + return std::transform(std::begin(range), std::end(range), iter, unary_op); +} + +} // namespace ranges +} // namespace ck diff --git a/library/include/ck/library/utility/check_err.hpp b/library/include/ck/library/utility/check_err.hpp new file mode 100644 index 00000000..a89d03d3 --- /dev/null +++ b/library/include/ck/library/utility/check_err.hpp @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/type.hpp" +#include "ck/host_utility/io.hpp" + +#include "ck/library/utility/ranges.hpp" + +namespace ck { +namespace utils { + +template +typename std::enable_if< + std::is_same_v, ranges::range_value_t> && + std::is_floating_point_v> && + !std::is_same_v, half_t>, + bool>::type +check_err(const Range& out, + const RefRange& ref, + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-5, + double atol = 3e-6) +{ + if(out.size() != ref.size()) + { + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; + return false; + } + + bool res{true}; + int err_count = 0; + double err = 0; + double max_err = std::numeric_limits::min(); + for(std::size_t i = 0; i < ref.size(); ++i) + { + const double o = *std::next(std::begin(out), i); + const double r = *std::next(std::begin(ref), i); + err = std::abs(o - r); + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) + { + max_err = err > max_err ? 
err : max_err; + err_count++; + if(err_count < 5) + { + std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i + << "] != ref[" << i << "]: " << o << " != " << r << std::endl; + } + res = false; + } + } + if(!res) + { + std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; + } + return res; +} + +template +typename std::enable_if< + std::is_same_v, ranges::range_value_t> && + std::is_same_v, bhalf_t>, + bool>::type +check_err(const Range& out, + const RefRange& ref, + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-3, + double atol = 1e-3) +{ + if(out.size() != ref.size()) + { + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; + return false; + } + + bool res{true}; + int err_count = 0; + double err = 0; + // TODO: This is a hack. We should have proper specialization for bhalf_t data type. + double max_err = std::numeric_limits::min(); + for(std::size_t i = 0; i < ref.size(); ++i) + { + const double o = type_convert(*std::next(std::begin(out), i)); + const double r = type_convert(*std::next(std::begin(ref), i)); + err = std::abs(o - r); + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) + { + max_err = err > max_err ? err : max_err; + err_count++; + if(err_count < 5) + { + std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i + << "] != ref[" << i << "]: " << o << " != " << r << std::endl; + } + res = false; + } + } + if(!res) + { + std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; + } + return res; +} + +template +typename std::enable_if< + std::is_same_v, ranges::range_value_t> && + std::is_same_v, half_t>, + bool>::type +check_err(const Range& out, + const RefRange& ref, + const std::string& msg = "Error: Incorrect results!", + double rtol = 1e-3, + double atol = 1e-3) +{ + if(out.size() != ref.size()) + { + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; + return false; + } + + bool res{true}; + int err_count = 0; + double err = 0; + double max_err = std::numeric_limits>::min(); + for(std::size_t i = 0; i < ref.size(); ++i) + { + const double o = type_convert(*std::next(std::begin(out), i)); + const double r = type_convert(*std::next(std::begin(ref), i)); + err = std::abs(o - r); + if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r)) + { + max_err = err > max_err ? 
err : max_err; + err_count++; + if(err_count < 5) + { + std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i + << "] != ref[" << i << "]: " << o << " != " << r << std::endl; + } + res = false; + } + } + if(!res) + { + std::cerr << std::setw(12) << std::setprecision(7) << "max err: " << max_err << std::endl; + } + return res; +} + +template +std::enable_if_t<(std::is_same_v, ranges::range_value_t> && + std::is_integral_v> && + !std::is_same_v, bhalf_t>) +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + || std::is_same_v, int4_t> +#endif + , + bool> +check_err(const Range& out, + const RefRange& ref, + const std::string& msg = "Error: Incorrect results!", + double = 0, + double atol = 0) +{ + if(out.size() != ref.size()) + { + std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size() + << std::endl; + return false; + } + + bool res{true}; + int err_count = 0; + int64_t err = 0; + int64_t max_err = std::numeric_limits::min(); + for(std::size_t i = 0; i < ref.size(); ++i) + { + const int64_t o = *std::next(std::begin(out), i); + const int64_t r = *std::next(std::begin(ref), i); + err = std::abs(o - r); + + if(err > atol) + { + max_err = err > max_err ? err : max_err; + err_count++; + if(err_count < 5) + { + std::cerr << msg << " out[" << i << "] != ref[" << i << "]: " << o << " != " << r + << std::endl; + } + res = false; + } + } + if(!res) + { + std::cerr << "max err: " << max_err << std::endl; + } + return res; +} + +} // namespace utils +} // namespace ck diff --git a/library/include/ck/library/utility/conv_common.hpp b/library/include/ck/library/utility/conv_common.hpp new file mode 100644 index 00000000..6fad9f7d --- /dev/null +++ b/library/include/ck/library/utility/conv_common.hpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
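check_err compares an output range with a reference range element by element: the floating-point overloads use an atol + rtol * |ref| tolerance (with looser defaults for half_t and bhalf_t), the integral overload compares against atol only, and all of them print the first few mismatches plus the maximum error before returning false. A minimal usage sketch with made-up data:

// Minimal sketch: tolerance-based comparison of two float buffers.
#include <vector>
#include "ck/library/utility/check_err.hpp"

int main()
{
    std::vector<float> ref{1.0f, 2.0f, 3.0f};
    std::vector<float> out{1.0f, 2.0000001f, 3.0f};

    // Passes because |out[1] - ref[1]| <= atol + rtol * |ref[1]|.
    const bool ok = ck::utils::check_err(out, ref, "Error: Incorrect results!", 1e-5, 3e-6);

    return ok ? 0 : 1;
}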
+ +#pragma once + +#include "ck/tensor_description/tensor_descriptor.hpp" + +template +constexpr auto get_convolution_output_default_4d_tensor_descriptor( + const ck::TensorDescriptor& in_desc, + const ck::TensorDescriptor& wei_desc, + const ConvStrides& conv_strides, + const ConvDilations conv_dilations, + const LeftPads& left_pads, + const RightPads& right_pads) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + assert(in_desc.GetNumOfDimension() == 4); + assert(wei_desc.GetNumOfDimension() == 4); + assert(in_desc.GetLength(I1) == wei_desc.GetLength(I1)); + + const auto N = in_desc.GetLength(I0); + const auto Hi = in_desc.GetLength(I2); + const auto Wi = in_desc.GetLength(I3); + + const auto K = wei_desc.GetLength(I0); + const auto Y = wei_desc.GetLength(I2); + const auto X = wei_desc.GetLength(I3); + + const auto LeftPadH = left_pads[I0]; + const auto LeftPadW = left_pads[I1]; + + const auto RightPadH = right_pads[I0]; + const auto RightPadW = right_pads[I1]; + + const auto YEff = (Y - I1) * conv_dilations[I0] + I1; + const auto XEff = (X - I1) * conv_dilations[I1] + I1; + + const auto Ho = (Hi + LeftPadH + RightPadH - YEff) / conv_strides[I0] + I1; + const auto Wo = (Wi + LeftPadW + RightPadW - XEff) / conv_strides[I1] + I1; + + return make_naive_tensor_descriptor_packed(make_tuple(N, K, Ho, Wo)); +} + +template +constexpr std::size_t +calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDesc& out_desc) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + constexpr auto I3 = Number<3>{}; + + const index_t N = out_desc.GetLength(I0); + const index_t K = out_desc.GetLength(I1); + const index_t Ho = out_desc.GetLength(I2); + const index_t Wo = out_desc.GetLength(I3); + + const index_t C = wei_desc.GetLength(I1); + const index_t Y = wei_desc.GetLength(I2); + const index_t X = wei_desc.GetLength(I3); + + return std::size_t(2) * N * K * Ho * Wo * C * Y * X; +} diff --git a/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp new file mode 100644 index 00000000..2b4f63b2 --- /dev/null +++ b/library/include/ck/library/utility/convolution_host_tensor_descriptor_helper.hpp @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace utils { +namespace conv { + +namespace detail { + +template +std::vector get_layout_transpose_gnchw_to_old() +{ + // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function, + // is used by some legacy kernel. 
New kernel should use GNHWK/GKYXC/GNHWK + // TODO: remove this branch after removing legacy kernel + if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 3, 2}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 4, 2, 3}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 5, 2, 3, 4}; + } + // separate from legacy code above + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 2, 3}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 2, 3, 4}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 2, 3, 4, 5}; + } + if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 3, 2}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 4, 2, 3}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {0, 1, 5, 2, 3, 4}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {2, 0, 3, 1}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {3, 0, 4, 1, 2}; + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + return {4, 0, 5, 1, 2, 3}; + } + else + { + printf("%s\n", __func__); + throw std::runtime_error("wrong! unsupported layout"); + } +} + +} // namespace detail + +// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW +// regardless of physical layout +template +HostTensorDescriptor +make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck::utils::conv::ConvParam& param) +{ + std::vector physical_lengths; + + // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function, + // is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK + // TODO: remove this branch after removing legacy kernel + if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + if(param.G_ != 1) + { + throw std::runtime_error("wrong! 
G != 1"); + } + + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + // separate from legacy code above + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.N_), + static_cast(param.G_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 1, + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else + { + printf("%s\n", __func__); + printf("%s\n", InLayout::name); + throw std::runtime_error("wrong! unsupported layout"); + } + + return transpose_host_tensor_descriptor_given_new2old( + HostTensorDescriptor(physical_lengths), + detail::get_layout_transpose_gnchw_to_old()); +} + +// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX +// regardless of physical layout +template +HostTensorDescriptor +make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck::utils::conv::ConvParam& param) +{ + std::vector physical_lengths; + + // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function, + // is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK + // TODO: remove this branch after removing legacy kernel + if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + if(param.G_ != 1) + { + throw std::runtime_error("wrong! G != 1"); + } + + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + // separate from legacy code above + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + if(param.G_ != 1) + { + throw std::runtime_error("wrong! 
G != 1"); + } + + physical_lengths = std::vector{static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.K_), + static_cast(param.G_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 1, + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else + { + printf("%s\n", __func__); + printf("%s\n", WeiLayout::name); + throw std::runtime_error("wrong! unsupported layout"); + } + + return transpose_host_tensor_descriptor_given_new2old( + HostTensorDescriptor(physical_lengths), + detail::get_layout_transpose_gnchw_to_old()); +} + +// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW +// regardless of physical layout +template +HostTensorDescriptor +make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck::utils::conv::ConvParam& param) +{ + std::vector physical_lengths; + + // HACK: NHWC/KYXC/NHWK, which is treated as GNHWC/GKYXC/GNHWK by this function, + // is used by some legacy kernel. New kernel should use GNHWK/GKYXC/GNHWK + // TODO: remove this branch after removing legacy kernel + if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + if(param.G_ != 1) + { + throw std::runtime_error("wrong! 
G != 1"); + } + + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + // separate from legacy code above + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.end(), + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(ck::is_same_v || + ck::is_same_v || + ck::is_same_v) + { + physical_lengths = std::vector{static_cast(param.N_), + static_cast(param.G_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.begin() + 1, + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else + { + printf("%s\n", __func__); + printf("%s\n", OutLayout::name); + throw std::runtime_error("wrong! unsupported layout"); + } + + return transpose_host_tensor_descriptor_given_new2old( + HostTensorDescriptor(physical_lengths), + detail::get_layout_transpose_gnchw_to_old()); +} + +} // namespace conv +} // namespace utils +} // namespace ck diff --git a/library/include/ck/library/utility/convolution_parameter.hpp b/library/include/ck/library/utility/convolution_parameter.hpp new file mode 100644 index 00000000..f4a2b56f --- /dev/null +++ b/library/include/ck/library/utility/convolution_parameter.hpp @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" + +#include "ck/library/utility/numeric.hpp" + +namespace ck { +namespace utils { +namespace conv { + +struct ConvParam +{ + ConvParam(); + ConvParam(ck::index_t n_dim, + ck::index_t group_count, + ck::index_t n_batch, + ck::index_t n_out_channels, + ck::index_t n_in_channels, + const std::vector& filters_len, + const std::vector& input_len, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads); + + ck::index_t num_dim_spatial_; + ck::index_t G_; + ck::index_t N_; + ck::index_t K_; + ck::index_t C_; + + std::vector filter_spatial_lengths_; + std::vector input_spatial_lengths_; + std::vector output_spatial_lengths_; + + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + + std::vector input_left_pads_; + std::vector input_right_pads_; + + std::vector GetOutputSpatialLengths() const; + + std::size_t GetFlops() const; + + template + std::size_t GetInputByte() const + { + // sizeof(InDataType) * (G * N * C * ) + + return sizeof(InDataType) * + (G_ * N_ * C_ * + ck::accumulate_n( + std::begin(input_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>())); + } + + template + std::size_t GetWeightByte() const + { + // sizeof(WeiDataType) * (G * K * C * ) + + return sizeof(WeiDataType) * + (G_ * K_ * C_ * + ck::accumulate_n( + std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>())); + } + + template + std::size_t GetOutputByte() const + { + // sizeof(OutDataType) * (G * N * K * ); + return sizeof(OutDataType) * (G_ * N_ * K_ * + std::accumulate(std::begin(output_spatial_lengths_), + std::end(output_spatial_lengths_), + static_cast(1), + std::multiplies())); + } + + template + std::size_t GetByte() const + { + return GetInputByte() + GetWeightByte() + + GetOutputByte(); + } +}; + +std::string get_conv_param_parser_helper_msg(); + +ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[]); + +} // namespace conv +} // namespace utils +} // namespace ck + +std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParam& p); diff --git a/library/include/ck/library/utility/device_memory.hpp b/library/include/ck/library/utility/device_memory.hpp new file mode 100644 index 00000000..3c4ece44 --- /dev/null +++ b/library/include/ck/library/utility/device_memory.hpp @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +template +__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size) +{ + for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x) + { + p[i] = x; + } +} + +struct DeviceMem +{ + DeviceMem() = delete; + DeviceMem(std::size_t mem_size); + void* GetDeviceBuffer() const; + std::size_t GetBufferSize() const; + void ToDevice(const void* p) const; + void FromDevice(void* p) const; + void SetZero() const; + template + void SetValue(T x) const; + ~DeviceMem(); + + void* mpDeviceBuf; + std::size_t mMemSize; +}; + +template +void DeviceMem::SetValue(T x) const +{ + if(mMemSize % sizeof(T) != 0) + { + throw std::runtime_error("wrong! 
not entire DeviceMem will be set"); + } + + set_buffer_value<<<1, 1024>>>(static_cast(mpDeviceBuf), x, mMemSize / sizeof(T)); +} diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp new file mode 100644 index 00000000..54d58f36 --- /dev/null +++ b/library/include/ck/library/utility/fill.hpp @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ck/utility/data_type.hpp" + +namespace ck { +namespace utils { + +template +struct FillUniformDistribution +{ + float a_{-5.f}; + float b_{5.f}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::mt19937 gen(11939); + std::uniform_real_distribution dis(a_, b_); + std::generate(first, last, [&dis, &gen]() { return ck::type_convert(dis(gen)); }); + } + + template + auto operator()(ForwardRange&& range) const + -> std::void_t()( + std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } +}; + +// Normally FillUniformDistributionIntegerValue should use std::uniform_int_distribution as below. +// However this produces segfaults in std::mt19937 which look like inifite loop. +// template +// struct FillUniformDistributionIntegerValue +// { +// int a_{-5}; +// int b_{5}; +// +// template +// void operator()(ForwardIter first, ForwardIter last) const +// { +// std::mt19937 gen(11939); +// std::uniform_int_distribution dis(a_, b_); +// std::generate( +// first, last, [&dis, &gen]() { return ck::type_convert(dis(gen)); }); +// } +// }; + +// Workaround for uniform_int_distribution not working as expected. See note above.< +template +struct FillUniformDistributionIntegerValue +{ + float a_{-5.f}; + float b_{5.f}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::mt19937 gen(11939); + std::uniform_real_distribution dis(a_, b_); + std::generate( + first, last, [&dis, &gen]() { return ck::type_convert(std::round(dis(gen))); }); + } + + template + auto operator()(ForwardRange&& range) const + -> std::void_t()( + std::begin(std::forward(range)), + std::end(std::forward(range))))> + { + (*this)(std::begin(std::forward(range)), + std::end(std::forward(range))); + } +}; + +template +struct FillMonotonicSeq +{ + T init_value_{0}; + T step_{1}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::generate(first, last, [=, n = init_value_]() mutable { + auto tmp = n; + n += step_; + return tmp; + }); + } +}; + +template +struct FillConstant +{ + T value_{0}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::fill(first, last, value_); + } +}; + +} // namespace utils +} // namespace ck diff --git a/library/include/ck/library/utility/host_common_util.hpp b/library/include/ck/library/utility/host_common_util.hpp new file mode 100644 index 00000000..6f4466e8 --- /dev/null +++ b/library/include/ck/library/utility/host_common_util.hpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
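+//
+// Example (hypothetical input, integer element type assumed): getTypeValuesFromString splits a
+// comma-separated argument string on ',' and converts every slice with getSingleValueFromString,
+// so getTypeValuesFromString<int>("16,64,3,3") is expected to yield the vector {16, 64, 3, 3}.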
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" + +namespace ck { + +namespace host_common { + +template +static inline void dumpBufferToFile(const char* fileName, T* data, size_t dataNumItems) +{ + std::ofstream outFile(fileName, std::ios::binary); + if(outFile) + { + outFile.write(reinterpret_cast(data), dataNumItems * sizeof(T)); + outFile.close(); + std::cout << "Write output to file " << fileName << std::endl; + } + else + { + std::cout << "Could not open file " << fileName << " for writing" << std::endl; + } +}; + +template +static inline T getSingleValueFromString(const std::string& valueStr) +{ + std::istringstream iss(valueStr); + + T val; + + iss >> val; + + return (val); +}; + +template +static inline std::vector getTypeValuesFromString(const char* cstr_values) +{ + std::string valuesStr(cstr_values); + + std::vector values; + std::size_t pos = 0; + std::size_t new_pos; + + new_pos = valuesStr.find(',', pos); + while(new_pos != std::string::npos) + { + const std::string sliceStr = valuesStr.substr(pos, new_pos - pos); + + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + pos = new_pos + 1; + new_pos = valuesStr.find(',', pos); + }; + + std::string sliceStr = valuesStr.substr(pos); + T val = getSingleValueFromString(sliceStr); + + values.push_back(val); + + return (values); +} + +template +static inline std::vector> +get_index_set(const std::array& dim_lengths) +{ + static_assert(NDim >= 1, "NDim >= 1 is required to use this function!"); + + if constexpr(NDim == 1) + { + std::vector> index_set; + + for(int i = 0; i < dim_lengths[0]; i++) + { + std::array index{i}; + + index_set.push_back(index); + }; + + return index_set; + } + else + { + std::vector> index_set; + std::array partial_dim_lengths; + + std::copy(dim_lengths.begin() + 1, dim_lengths.end(), partial_dim_lengths.begin()); + + std::vector> partial_index_set; + + partial_index_set = get_index_set(partial_dim_lengths); + + for(index_t i = 0; i < dim_lengths[0]; i++) + for(const auto& partial_index : partial_index_set) + { + std::array index; + + index[0] = i; + + std::copy(partial_index.begin(), partial_index.end(), index.begin() + 1); + + index_set.push_back(index); + }; + + return index_set; + }; +}; + +template +static inline size_t get_offset_from_index(const std::array& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += index[i] * strides[i]; + + return (offset); +}; + +} // namespace host_common +} // namespace ck diff --git a/library/include/ck/library/utility/host_conv.hpp b/library/include/ck/library/utility/host_conv.hpp new file mode 100644 index 00000000..8348a308 --- /dev/null +++ b/library/include/ck/library/utility/host_conv.hpp @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
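+//
+// The reference kernels in this header compute a direct convolution on the host; for the
+// NCHW/KCYX/NKHW variant below this is
+//
+//   out(n, k, ho, wo) = sum_{c, y, x} in(n, c, ho*stride_h + y*dilation_h - pad_h,
+//                                              wo*stride_w + x*dilation_w - pad_w)
+//                                     * wei(k, c, y, x)
+//
+// where input positions that fall outside the (unpadded) input are skipped, i.e. treated as zero.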
+ +#pragma once +#include "host_tensor.hpp" +#include "conv_common.hpp" + +template +void host_conv_nchw_kcyx_nkhw(const Tensor& in, + const Tensor& wei, + Tensor& out, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads&) +{ + constexpr auto I0 = ck::Number<0>{}; + constexpr auto I1 = ck::Number<1>{}; + + auto f_nchw = [&](auto n, auto k, auto ho, auto wo) { + float v = 0; + for(int c = 0; c < wei.mDesc.GetLengths()[1]; ++c) + { + for(int y = 0; y < wei.mDesc.GetLengths()[2]; ++y) + { + int hi = ho * conv_strides[I0] + y * conv_dilations[I0] - in_left_pads[I0]; + for(int x = 0; x < wei.mDesc.GetLengths()[3]; ++x) + { + int wi = wo * conv_strides[I1] + x * conv_dilations[I1] - in_left_pads[I1]; + if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && + wi < in.mDesc.GetLengths()[3]) + { + v += ck::type_convert(in(n, c, hi, wi)) * + ck::type_convert(wei(k, c, y, x)); + } + } + } + } + out(n, k, ho, wo) = ck::type_convert(v); + }; + + make_ParallelTensorFunctor(f_nchw, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); +} + +template +void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor& in, + const Tensor& wei, + Tensor& out, + const ConvStrides& conv_strides, + const ConvDilations& conv_dilations, + const InLeftPads& in_left_pads, + const InRightPads&) +{ + using namespace ck; + + constexpr auto I0 = Number<0>{}; + constexpr auto I1 = Number<1>{}; + constexpr auto I2 = Number<2>{}; + const auto Di = in.mDesc.GetLengths()[1]; + const auto Hi = in.mDesc.GetLengths()[2]; + const auto Wi = in.mDesc.GetLengths()[3]; + const auto Z = wei.mDesc.GetLengths()[1]; + const auto Y = wei.mDesc.GetLengths()[2]; + const auto X = wei.mDesc.GetLengths()[3]; + const auto C = wei.mDesc.GetLengths()[4]; + + auto f_ndhwc = [&](auto n, auto do_tmp, auto ho_tmp, auto wo_tmp, auto k) { + // do__ must be converted to signed integer, otherwise zmin might be wrong in cases + // negative values. 
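+ // (If do_ stayed unsigned, in_left_pads[I0] - do_ * conv_strides[I0] below would wrap around
+ // to a huge positive value as soon as do_ * conv_strides[I0] exceeds the left pad, and the
+ // std::max(0, ...) clamp would then produce the wrong zmin.)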
+ const int do_ = static_cast(do_tmp); + const int ho = static_cast(ho_tmp); + const int wo = static_cast(wo_tmp); + const int zmin = + std::max(0, + (in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) / + conv_dilations[I0]); + const int ymin = + std::max(0, + (in_left_pads[I1] - ho * conv_strides[I1] + conv_dilations[I1] - 1) / + conv_dilations[I1]); + const int xmin = + std::max(0, + (in_left_pads[I2] - wo * conv_strides[I2] + conv_dilations[I2] - 1) / + conv_dilations[I2]); + const int zmax = + std::min(Z, (in_left_pads[I0] - do_ * conv_strides[I0] + Di) / conv_dilations[I0]); + const int ymax = + std::min(Y, (in_left_pads[I1] - ho * conv_strides[I1] + Hi) / conv_dilations[I1]); + const int xmax = + std::min(X, (in_left_pads[I2] - wo * conv_strides[I2] + Wi) / conv_dilations[I2]); + const int di_min = do_ * conv_strides[I0] + zmin * conv_dilations[I0] - in_left_pads[I0]; + const int hi_min = ho * conv_strides[I1] + ymin * conv_dilations[I1] - in_left_pads[I1]; + const int wi_min = wo * conv_strides[I2] + xmin * conv_dilations[I2] - in_left_pads[I2]; + + double v = 0; + + const TIn* in_n = in.mData.data() + n * Di * Hi * Wi * C; + const TWei* wei_k = wei.mData.data() + k * Z * Y * X * C; + + int di = di_min; + for(int z = zmin; z < zmax; ++z, di += conv_dilations[I0]) + { + const TIn* in_n_di = in_n + di * Hi * Wi * C; + const TWei* wei_k_z = wei_k + z * Y * X * C; + int hi = hi_min; + + for(int y = ymin; y < ymax; ++y, hi += conv_dilations[I1]) + { + const TIn* in_n_di_hi = in_n_di + hi * Wi * C; + const TWei* wei_k_z_y = wei_k_z + y * X * C; + int wi = wi_min; + + for(int x = xmin; x < xmax; ++x, wi += conv_dilations[I2]) + { + const TIn* in_n_di_hi_wi = in_n_di_hi + wi * C; + const TWei* wei_k_z_y_x = wei_k_z_y + x * C; + + for(int c = 0; c < C; ++c) + { + v += static_cast(in_n_di_hi_wi[c]) * + static_cast(wei_k_z_y_x[c]); + } + } + } + } + + out(n, do_, ho, wo, k) = v; + }; + + make_ParallelTensorFunctor(f_ndhwc, + out.mDesc.GetLengths()[0], + out.mDesc.GetLengths()[1], + out.mDesc.GetLengths()[2], + out.mDesc.GetLengths()[3], + out.mDesc.GetLengths()[4])(std::thread::hardware_concurrency() - 4); +} diff --git a/library/include/ck/library/utility/host_gemm.hpp b/library/include/ck/library/utility/host_gemm.hpp new file mode 100644 index 00000000..44036d02 --- /dev/null +++ b/library/include/ck/library/utility/host_gemm.hpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
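+//
+// Reference GEMM on the host: A is M x K, B is K x N, and the kernel below computes
+//
+//   C(m, n) = c_op( sum_k a_op(A(m, k)) * b_op(B(k, n)) )
+//
+// where a_op / b_op / c_op are the elementwise operations applied to the operands and to the
+// accumulated result.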
+ +#pragma once + +#include "host_tensor.hpp" + +template +void host_gemm_mk_kn_mn(const Tensor& a_m_k, + const Tensor& b_k_n, + Tensor& c_m_n, + const AElementwiseOperation& a_element_op, + const BElementwiseOperation& b_element_op, + const CElementwiseOperation& c_element_op) +{ + auto f_mk_kn_mn = [&](auto m, auto n) { + const int K = a_m_k.mDesc.GetLengths()[1]; + + float v_acc = 0; + + for(int k = 0; k < K; ++k) + { + float v_a; + float v_b; + + a_element_op(v_a, static_cast(a_m_k(m, k))); + b_element_op(v_b, static_cast(b_k_n(k, n))); + + v_acc += v_a * v_b; + } + + float v_c; + + c_element_op(v_c, v_acc); + + c_m_n(m, n) = v_c; + }; + + make_ParallelTensorFunctor(f_mk_kn_mn, + c_m_n.mDesc.GetLengths()[0], + c_m_n.mDesc.GetLengths()[1])(std::thread::hardware_concurrency()); +} diff --git a/library/include/ck/library/utility/host_reduction.hpp b/library/include/ck/library/utility/host_reduction.hpp new file mode 100644 index 00000000..7c0c969a --- /dev/null +++ b/library/include/ck/library/utility/host_reduction.hpp @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" + +template +static void get_all_indexes(const std::array& dimLengths, + std::vector>& indexes) +{ + static_assert(NDim >= 1, "NDim >= 1 is required to use this function!"); + + if constexpr(NDim == 1) + { + for(size_t i = 0; i < dimLengths[0]; i++) + { + std::array index{i}; + + indexes.push_back(index); + }; + } + else + { + std::array partial_dim_lengths; + + for(int i = 0; i < NDim - 1; i++) + partial_dim_lengths[i] = dimLengths[i + 1]; + + std::vector> partial_indexes; + + get_all_indexes(partial_dim_lengths, partial_indexes); + + for(size_t i = 0; i < dimLengths[0]; i++) + for(const auto& index : partial_indexes) + { + std::array extIndex; + + extIndex[0] = i; + + for(int k = 0; k < NDim - 1; k++) + extIndex[k + 1] = index[k]; + + indexes.push_back(extIndex); + }; + }; +}; + +template +static size_t get_offset_from_index(const std::array& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +static size_t get_offset_from_index(const std::vector& strides, + const std::array& index) +{ + size_t offset = 0; + + for(int i = 0; i < NDim; i++) + offset += strides[i] * index[i]; + + return (offset); +}; + +template +struct ReductionHost +{ + using IndexDataType = int32_t; + + static constexpr int NumInvariantDim = Rank - NumReduceDim; + + std::vector outStrides; + + IndexDataType divider; + + std::array reduceLengths; + std::array reduceStrides; + std::array invariantLengths; + std::array invariantStrides; + + std::vector> reduce_dim_indexes; + std::vector> invariant_dim_indexes; + + ReductionHost(HostTensorDescriptor& inDesc, + HostTensorDescriptor& outDesc, + const std::array invariantDims, + const std::array reduceDims) + { + // this->outLengths = to_int_vector(outDesc.GetLengths()); + this->outStrides = outDesc.GetStrides(); + + int product = 1; + + for(int i = 0; i < NumReduceDim; i++) + { + reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]]; + reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]]; + 
product *= inDesc.GetLengths()[reduceDims[i]]; + }; + + divider = product; + + for(int i = 0; i < NumInvariantDim; i++) + { + invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]]; + invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]]; + }; + + reduce_dim_indexes.clear(); + get_all_indexes(reduceLengths, reduce_dim_indexes); + + if constexpr(NumInvariantDim > 0) + { + invariant_dim_indexes.clear(); + get_all_indexes(invariantLengths, invariant_dim_indexes); + }; + }; + + void Run(float alpha, + const InDataType* in_data, + float beta, + OutDataType* out_data, + IndexDataType* out_indices, + InElementwiseOperation in_elementwise_op, + AccElementwiseOperation acc_elementwise_op) + { + if constexpr(OutputIndex) + { + RunImpl_with_index( + alpha, in_data, beta, out_data, out_indices, in_elementwise_op, acc_elementwise_op); + } + else + { + RunImpl_no_index(alpha, in_data, beta, out_data, in_elementwise_op, acc_elementwise_op); + }; + }; + + void RunImpl_with_index(float alpha, + const InDataType* in_data, + float beta, + OutDataType* out_data, + IndexDataType* out_indices, + InElementwiseOperation in_elementwise_op, + AccElementwiseOperation acc_elementwise_op) + { + using ck::float_equal_one; + using ck::float_equal_zero; + using ck::type_convert; + + using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck; + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOperation::template GetIdentityValue(); + IndexDataType accuIndex = 0; + + for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); + + auto currVal = type_convert(in_data[offset_reduce]); + + in_elementwise_op(currVal, currVal); + + auto currIndex = static_cast(i); + + Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); + }; + + acc_elementwise_op(accuVal, accuVal); + + if(!float_equal_one{}(alpha)) + accuVal *= type_convert(alpha); + + if(!float_equal_zero{}(beta)) + accuVal += type_convert(out_data[0]) * type_convert(beta); + + out_data[0] = type_convert(accuVal); + out_indices[0] = accuIndex; + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = ReduceOperation::template GetIdentityValue(); + IndexDataType accuIndex = 0; + + auto offset_invariant = + get_offset_from_index(invariantStrides, invariant_index); + + for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); + + auto currVal = + type_convert(in_data[offset_invariant + offset_reduce]); + + in_elementwise_op(currVal, currVal); + + auto currIndex = static_cast(i); + + Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); + }; + + acc_elementwise_op(accuVal, accuVal); + + if(!float_equal_one{}(alpha)) + accuVal *= type_convert(alpha); + + auto dst_offset = + get_offset_from_index(outStrides, invariant_index); + + if(!float_equal_zero{}(beta)) + accuVal += type_convert(out_data[dst_offset]) * + type_convert(beta); + + out_data[dst_offset] = type_convert(accuVal); + out_indices[dst_offset] = accuIndex; + }; + + std::size_t num_thread = 1; + std::size_t work_per_thread = + (invariant_dim_indexes.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = + std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); + + auto f = [=] { + 
for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + thread_reduce_func(invariant_dim_indexes[iw]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; + + void RunImpl_no_index(float alpha, + const InDataType* in_data, + float beta, + OutDataType* out_data, + InElementwiseOperation in_elementwise_op, + AccElementwiseOperation acc_elementwise_op) + { + using ck::float_equal_one; + using ck::float_equal_zero; + using ck::type_convert; + + using Accumulation = + ck::detail::AccumulateWithNanCheck; + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOperation::template GetIdentityValue(); + + for(const auto& reduce_index : reduce_dim_indexes) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_index); + + auto currVal = type_convert(in_data[offset_reduce]); + + in_elementwise_op(currVal, currVal); + + Accumulation::Calculate(accuVal, currVal); + }; + + acc_elementwise_op(accuVal, accuVal); + + if(!float_equal_one{}(alpha)) + accuVal *= type_convert(alpha); + + if(!float_equal_zero{}(beta)) + accuVal += type_convert(out_data[0]) * type_convert(beta); + + out_data[0] = type_convert(accuVal); + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = ReduceOperation::template GetIdentityValue(); + + auto offset_invariant = + get_offset_from_index(invariantStrides, invariant_index); + + for(const auto& reduce_index : reduce_dim_indexes) + { + auto offset_reduce = + get_offset_from_index(reduceStrides, reduce_index); + + auto currVal = + type_convert(in_data[offset_invariant + offset_reduce]); + + in_elementwise_op(currVal, currVal); + + Accumulation::Calculate(accuVal, currVal); + }; + + acc_elementwise_op(accuVal, accuVal); + + if(!float_equal_one{}(alpha)) + accuVal *= type_convert(alpha); + + auto dst_offset = + get_offset_from_index(outStrides, invariant_index); + + if(!float_equal_zero{}(beta)) + accuVal += type_convert(out_data[dst_offset]) * + type_convert(beta); + + out_data[dst_offset] = type_convert(accuVal); + }; + + std::size_t num_thread = 1; + std::size_t work_per_thread = + (invariant_dim_indexes.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = + std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + thread_reduce_func(invariant_dim_indexes[iw]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; +}; diff --git a/library/include/ck/library/utility/host_tensor.hpp b/library/include/ck/library/utility/host_tensor.hpp new file mode 100644 index 00000000..a8c7fd03 --- /dev/null +++ b/library/include/ck/library/utility/host_tensor.hpp @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
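+//
+// Minimal usage sketch (M, K, m, k and the float element type are placeholders for this
+// example only): a host tensor is created from its lengths, the strides are filled in by
+// CalculateStrides(), and elements are addressed through the variadic operator().
+//
+//   HostTensorDescriptor desc({M, K});
+//   Tensor<float> a_m_k(desc);
+//   a_m_k(m, k)  = 1.f;
+//   float* p_raw = a_m_k.data();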
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "ck/utility/data_type.hpp" +#include "ck/utility/span.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/ranges.hpp" + +template +std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim) +{ + bool first = true; + for(auto&& v : range) + { + if(first) + first = false; + else + os << delim; + os << v; + } + return os; +} + +template +std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim) +{ + bool first = true; + for(auto&& v : range) + { + if(first) + first = false; + else + os << delim; + os << static_cast(v); + } + return os; +} + +template +auto call_f_unpack_args_impl(F f, T args, std::index_sequence) +{ + return f(std::get(args)...); +} + +template +auto call_f_unpack_args(F f, T args) +{ + constexpr std::size_t N = std::tuple_size{}; + + return call_f_unpack_args_impl(f, args, std::make_index_sequence{}); +} + +template +auto construct_f_unpack_args_impl(T args, std::index_sequence) +{ + return F(std::get(args)...); +} + +template +auto construct_f_unpack_args(F, T args) +{ + constexpr std::size_t N = std::tuple_size{}; + + return construct_f_unpack_args_impl(args, std::make_index_sequence{}); +} + +struct HostTensorDescriptor +{ + HostTensorDescriptor() = default; + + void CalculateStrides(); + + template >> + HostTensorDescriptor(const std::initializer_list& lens) : mLens(lens.begin(), lens.end()) + { + this->CalculateStrides(); + } + + template , std::size_t>>> + HostTensorDescriptor(const Lengths& lens) : mLens(lens.begin(), lens.end()) + { + this->CalculateStrides(); + } + + template && + std::is_convertible_v>> + HostTensorDescriptor(const std::initializer_list& lens, + const std::initializer_list& strides) + : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) + { + } + + template , std::size_t> && + std::is_convertible_v, std::size_t>>> + HostTensorDescriptor(const Lengths& lens, const Strides& strides) + : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end()) + { + } + + std::size_t GetNumOfDimension() const; + std::size_t GetElementSize() const; + std::size_t GetElementSpaceSize() const; + + const std::vector& GetLengths() const; + const std::vector& GetStrides() const; + + template + std::size_t GetOffsetFromMultiIndex(Is... is) const + { + assert(sizeof...(Is) == this->GetNumOfDimension()); + std::initializer_list iss{static_cast(is)...}; + return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0}); + } + + std::size_t GetOffsetFromMultiIndex(std::vector iss) const + { + return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0}); + } + + friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc); + + private: + std::vector mLens; + std::vector mStrides; +}; + +template +HostTensorDescriptor transpose_host_tensor_descriptor_given_new2old(const HostTensorDescriptor& a, + const New2Old& new2old) +{ + std::vector new_lengths(a.GetNumOfDimension()); + std::vector new_strides(a.GetNumOfDimension()); + + for(std::size_t i = 0; i < a.GetNumOfDimension(); i++) + { + new_lengths[i] = a.GetLengths()[new2old[i]]; + new_strides[i] = a.GetStrides()[new2old[i]]; + } + + return HostTensorDescriptor(new_lengths, new_strides); +} + +struct joinable_thread : std::thread +{ + template + joinable_thread(Xs&&... xs) : std::thread(std::forward(xs)...) 
+ { + } + + joinable_thread(joinable_thread&&) = default; + joinable_thread& operator=(joinable_thread&&) = default; + + ~joinable_thread() + { + if(this->joinable()) + this->join(); + } +}; + +template +struct ParallelTensorFunctor +{ + F mF; + static constexpr std::size_t NDIM = sizeof...(Xs); + std::array mLens; + std::array mStrides; + std::size_t mN1d; + + ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast(xs)...}) + { + mStrides.back() = 1; + std::partial_sum(mLens.rbegin(), + mLens.rend() - 1, + mStrides.rbegin() + 1, + std::multiplies()); + mN1d = mStrides[0] * mLens[0]; + } + + std::array GetNdIndices(std::size_t i) const + { + std::array indices; + + for(std::size_t idim = 0; idim < NDIM; ++idim) + { + indices[idim] = i / mStrides[idim]; + i -= indices[idim] * mStrides[idim]; + } + + return indices; + } + + void operator()(std::size_t num_thread = 1) const + { + std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t iw_begin = it * work_per_thread; + std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d); + + auto f = [=] { + for(std::size_t iw = iw_begin; iw < iw_end; ++iw) + { + call_f_unpack_args(mF, GetNdIndices(iw)); + } + }; + threads[it] = joinable_thread(f); + } + } +}; + +template +auto make_ParallelTensorFunctor(F f, Xs... xs) +{ + return ParallelTensorFunctor(f, xs...); +} + +template +struct Tensor +{ + using Descriptor = HostTensorDescriptor; + using Data = std::vector; + + template + Tensor(std::initializer_list lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) + { + } + + template + Tensor(std::initializer_list lens, std::initializer_list strides) + : mDesc(lens, strides), mData(mDesc.GetElementSpaceSize()) + { + } + + template + Tensor(const Lengths& lens) : mDesc(lens), mData(mDesc.GetElementSpaceSize()) + { + } + + template + Tensor(const Lengths& lens, const Strides& strides) + : mDesc(lens, strides), mData(GetElementSpaceSize()) + { + } + + Tensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpaceSize()) {} + + template + Tensor CopyAsType() const + { + Tensor ret(mDesc); + + ck::ranges::transform( + mData, ret.mData.begin(), [](auto value) { return ck::type_convert(value); }); + + return ret; + } + + Tensor() = delete; + Tensor(const Tensor&) = default; + Tensor(Tensor&&) = default; + + ~Tensor() = default; + + Tensor& operator=(const Tensor&) = default; + Tensor& operator=(Tensor&&) = default; + + template + explicit Tensor(const Tensor& other) : Tensor(other.template CopyAsType()) + { + } + + decltype(auto) GetLengths() const { return mDesc.GetLengths(); } + + decltype(auto) GetStrides() const { return mDesc.GetStrides(); } + + std::size_t GetNumOfDimension() const { return mDesc.GetNumOfDimension(); } + + std::size_t GetElementSize() const { return mDesc.GetElementSize(); } + + std::size_t GetElementSpaceSize() const { return mDesc.GetElementSpaceSize(); } + + std::size_t GetElementSpaceSizeInBytes() const { return sizeof(T) * GetElementSpaceSize(); } + + void SetZero() { ck::ranges::fill(mData, 0); } + + template + void ForEach_impl(F&& f, std::vector& idx, size_t rank) + { + if(rank == mDesc.GetNumOfDimension()) + { + f(*this, idx); + return; + } + // else + for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++) + { + idx[rank] = i; + ForEach_impl(std::forward(f), idx, rank + 1); + } + } + + template + void ForEach(F&& f) + { + std::vector idx(mDesc.GetNumOfDimension(), 0); + 
ForEach_impl(std::forward(f), idx, size_t(0)); + } + + template + void ForEach_impl(const F&& f, std::vector& idx, size_t rank) const + { + if(rank == mDesc.GetNumOfDimension()) + { + f(*this, idx); + return; + } + // else + for(size_t i = 0; i < mDesc.GetLengths()[rank]; i++) + { + idx[rank] = i; + ForEach_impl(std::forward(f), idx, rank + 1); + } + } + + template + void ForEach(const F&& f) const + { + std::vector idx(mDesc.GetNumOfDimension(), 0); + ForEach_impl(std::forward(f), idx, size_t(0)); + } + + template + void GenerateTensorValue(G g, std::size_t num_thread = 1) + { + switch(mDesc.GetNumOfDimension()) + { + case 1: { + auto f = [&](auto i) { (*this)(i) = g(i); }; + make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread); + break; + } + case 2: { + auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); }; + make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread); + break; + } + case 3: { + auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); }; + make_ParallelTensorFunctor( + f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread); + break; + } + case 4: { + auto f = [&](auto i0, auto i1, auto i2, auto i3) { + (*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3); + }; + make_ParallelTensorFunctor(f, + mDesc.GetLengths()[0], + mDesc.GetLengths()[1], + mDesc.GetLengths()[2], + mDesc.GetLengths()[3])(num_thread); + break; + } + case 5: { + auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) { + (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4); + }; + make_ParallelTensorFunctor(f, + mDesc.GetLengths()[0], + mDesc.GetLengths()[1], + mDesc.GetLengths()[2], + mDesc.GetLengths()[3], + mDesc.GetLengths()[4])(num_thread); + break; + } + case 6: { + auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4, auto i5) { + (*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4, i5); + }; + make_ParallelTensorFunctor(f, + mDesc.GetLengths()[0], + mDesc.GetLengths()[1], + mDesc.GetLengths()[2], + mDesc.GetLengths()[3], + mDesc.GetLengths()[4], + mDesc.GetLengths()[5])(num_thread); + break; + } + default: throw std::runtime_error("unspported dimension"); + } + } + + template + T& operator()(Is... is) + { + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + } + + template + const T& operator()(Is... 
is) const + { + return mData[mDesc.GetOffsetFromMultiIndex(is...)]; + } + + T& operator()(std::vector idx) + { + return mData[mDesc.GetOffsetFromMultiIndex(idx)]; + } + + const T& operator()(std::vector idx) const + { + return mData[mDesc.GetOffsetFromMultiIndex(idx)]; + } + + typename Data::iterator begin() { return mData.begin(); } + + typename Data::iterator end() { return mData.end(); } + + typename Data::pointer data() { return mData.data(); } + + typename Data::const_iterator begin() const { return mData.begin(); } + + typename Data::const_iterator end() const { return mData.end(); } + + typename Data::const_pointer data() const { return mData.data(); } + + typename Data::size_type size() const { return mData.size(); } + + template + auto AsSpan() const + { + constexpr std::size_t FromSize = sizeof(T); + constexpr std::size_t ToSize = sizeof(U); + + using Element = std::add_const_t>; + return ck::span{reinterpret_cast(data()), size() * FromSize / ToSize}; + } + + template + auto AsSpan() + { + constexpr std::size_t FromSize = sizeof(T); + constexpr std::size_t ToSize = sizeof(U); + + using Element = std::remove_reference_t; + return ck::span{reinterpret_cast(data()), size() * FromSize / ToSize}; + } + + Descriptor mDesc; + Data mData; +}; diff --git a/library/include/ck/library/utility/host_tensor_generator.hpp b/library/include/ck/library/utility/host_tensor_generator.hpp new file mode 100644 index 00000000..4259862e --- /dev/null +++ b/library/include/ck/library/utility/host_tensor_generator.hpp @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" + +template +struct GeneratorTensor_0 +{ + template + T operator()(Is...) + { + return T{0}; + } +}; + +template +struct GeneratorTensor_1 +{ + T value = 1; + + template + T operator()(Is...) + { + return value; + } +}; + +template <> +struct GeneratorTensor_1 +{ + float value = 1.0; + + template + ck::bhalf_t operator()(Is...) + { + return ck::type_convert(value); + } +}; + +template <> +struct GeneratorTensor_1 +{ + int8_t value = 1; + + template + int8_t operator()(Is...) + { + return value; + } +}; + +template +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + T operator()(Is...) + { + return static_cast((std::rand() % (max_value - min_value)) + min_value); + } +}; + +template <> +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + ck::bhalf_t operator()(Is...) + { + float tmp = (std::rand() % (max_value - min_value)) + min_value; + return ck::type_convert(tmp); + } +}; + +template <> +struct GeneratorTensor_2 +{ + int min_value = 0; + int max_value = 1; + + template + int8_t operator()(Is...) + { + return (std::rand() % (max_value - min_value)) + min_value; + } +}; + +template +struct GeneratorTensor_3 +{ + float min_value = 0; + float max_value = 1; + + template + T operator()(Is...) + { + float tmp = float(std::rand()) / float(RAND_MAX); + + return static_cast(min_value + tmp * (max_value - min_value)); + } +}; + +template <> +struct GeneratorTensor_3 +{ + float min_value = 0; + float max_value = 1; + + template + ck::bhalf_t operator()(Is...) 
+ { + float tmp = float(std::rand()) / float(RAND_MAX); + + float fp32_tmp = min_value + tmp * (max_value - min_value); + + return ck::type_convert(fp32_tmp); + } +}; + +template +struct GeneratorTensor_4 +{ + std::default_random_engine generator; + std::normal_distribution distribution; + + GeneratorTensor_4(float mean, float stddev) : generator(1), distribution(mean, stddev){}; + + template + T operator()(Is...) + { + float tmp = distribution(generator); + + return ck::type_convert(tmp); + } +}; + +struct GeneratorTensor_Checkboard +{ + template + float operator()(Ts... Xs) const + { + std::array dims = {static_cast(Xs)...}; + return std::accumulate(dims.begin(), + dims.end(), + true, + [](bool init, ck::index_t x) -> int { return init != (x % 2); }) + ? 1 + : -1; + } +}; + +template +struct GeneratorTensor_Sequential +{ + template + float operator()(Ts... Xs) const + { + std::array dims = {{static_cast(Xs)...}}; + return dims[Dim]; + } +}; + +template +struct GeneratorTensor_Diagonal +{ + T value{1}; + + template + T operator()(Ts... Xs) const + { + std::array dims = {{static_cast(Xs)...}}; + size_t start_dim = dims.size() - NumEffectiveDim; + bool pred = true; + for(size_t i = start_dim + 1; i < dims.size(); i++) + { + pred &= (dims[start_dim] == dims[i]); + } + return pred ? value : T{0}; + } +}; diff --git a/library/include/ck/library/utility/iterator.hpp b/library/include/ck/library/utility/iterator.hpp new file mode 100644 index 00000000..9fdc88ea --- /dev/null +++ b/library/include/ck/library/utility/iterator.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/utility/type.hpp" + +namespace ck { + +template +using iter_value_t = typename std::iterator_traits>::value_type; + +template +using iter_reference_t = decltype(*std::declval()); + +template +using iter_difference_t = typename std::iterator_traits>::difference_type; + +} // namespace ck diff --git a/library/include/ck/library/utility/literals.hpp b/library/include/ck/library/utility/literals.hpp new file mode 100644 index 00000000..a73a2ea0 --- /dev/null +++ b/library/include/ck/library/utility/literals.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace literals { +// [P0330] Literal Suffix for (signed) size_t (C++23) +// ref: https://wg21.link/p0330r8 +inline constexpr std::size_t operator""_uz(unsigned long long size) +{ + return static_cast(size); +} + +inline constexpr std::size_t operator""_zu(unsigned long long size) +{ + return static_cast(size); +} +} // namespace literals +} // namespace ck diff --git a/library/include/ck/library/utility/numeric.hpp b/library/include/ck/library/utility/numeric.hpp new file mode 100644 index 00000000..70a7e87a --- /dev/null +++ b/library/include/ck/library/utility/numeric.hpp @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
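+//
+// accumulate_n is a thin wrapper around std::accumulate that folds exactly `count` elements
+// starting at `first`; e.g. (hypothetical values)
+//
+//   std::vector<int> lens{2, 3, 4, 7};
+//   ck::accumulate_n(lens.begin(), 3, 1, std::multiplies<>{}); // 2 * 3 * 4 = 24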
+ +#pragma once + +#include +#include + +namespace ck { +template +auto accumulate_n(ForwardIterator first, Size count, T init, BinaryOperation op) + -> decltype(std::accumulate(first, std::next(first, count), init, op)) +{ + return std::accumulate(first, std::next(first, count), init, op); +} +} // namespace ck diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp new file mode 100644 index 00000000..78812e8c --- /dev/null +++ b/library/include/ck/library/utility/op_instance_engine.hpp @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/utility/functional2.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace utils { + +struct ProfileBestConfig +{ + std::string best_op_name; + float best_avg_time = std::numeric_limits::max(); + float best_tflops = std::numeric_limits::max(); + float best_gb_per_sec = std::numeric_limits::max(); +}; + +/** + * @brief This class describes an operation instance(s). + * + * Op instance defines a particular specializations of operator + * template. Thanks to this specific input/output data types, data + * layouts and modifying elementwise operations it is able to create + * it's input/output tensors, provide pointers to instances which + * can execute it and all operation specific parameters. + */ +template +class OpInstance +{ + public: + template + using TensorPtr = std::unique_ptr>; + using InTensorsTuple = std::tuple...>; + using DeviceMemPtr = std::unique_ptr; + using DeviceBuffers = std::vector; + + OpInstance() = default; + OpInstance(const OpInstance&) = default; + OpInstance& operator=(const OpInstance&) = default; + virtual ~OpInstance(){}; + + virtual InTensorsTuple GetInputTensors() const = 0; + virtual TensorPtr GetOutputTensor() const = 0; + virtual std::unique_ptr + MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0; + virtual std::unique_ptr + MakeArgumentPointer(tensor_operation::device::BaseOperator*, + const DeviceBuffers&, + const DeviceMemPtr&) const = 0; + virtual std::size_t GetFlops() const = 0; + virtual std::size_t GetBtype() const = 0; +}; + +/** + * @brief A generic operation instance run engine. 
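+ *
+ * It asks the given OpInstance for its host input/output tensors, optionally computes a
+ * reference result, uploads the inputs into device buffers, and then provides Test() and
+ * Profile(), which run a list of operator instance pointers against those buffers to verify
+ * correctness and/or collect timing statistics.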
+ */ +template +class OpInstanceRunEngine +{ + public: + using OpInstanceT = OpInstance; + template + using TensorPtr = std::unique_ptr>; + using DeviceMemPtr = std::unique_ptr; + using InTensorsTuple = std::tuple...>; + using DeviceBuffers = std::vector; + using InArgsTypesTuple = std::tuple; + + OpInstanceRunEngine() = delete; + + template > + OpInstanceRunEngine(const OpInstanceT& op_instance, + const ReferenceOp& reference_op = ReferenceOp{}, + bool do_verification = true) + : op_instance_{op_instance} + { + in_tensors_ = op_instance_.GetInputTensors(); + out_tensor_ = op_instance_.GetOutputTensor(); + + if constexpr(std::is_invocable_v&..., + Tensor&>) + { + if(do_verification) + { + ref_output_ = op_instance_.GetOutputTensor(); + CallRefOpUnpackArgs(reference_op, std::make_index_sequence{}); + } + } + AllocateDeviceInputTensors(std::make_index_sequence{}); + out_device_buffer_ = std::make_unique(sizeof(OutDataType) * + out_tensor_->mDesc.GetElementSpaceSize()); + out_device_buffer_->SetZero(); + } + + virtual ~OpInstanceRunEngine(){}; + + template + bool Test(const std::vector& op_ptrs) + { + bool res{true}; + for(auto& op_ptr : op_ptrs) + { + auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get()); + auto argument = op_instance_.MakeArgumentPointer( + op_ptr.get(), in_device_buffers_, out_device_buffer_); + if(op_ptr->IsSupportedArgument(argument.get())) + { + std::cout << "Testing instance: " << op_ptr->GetTypeString() << std::endl; + invoker->Run(argument.get()); + out_device_buffer_->FromDevice(out_tensor_->mData.data()); + if(!ref_output_) + { + throw std::runtime_error( + "OpInstanceRunEngine::Test: Reference value not availabe." + " You have to provide reference function."); + } + // TODO: enable flexible use of custom check_error functions + bool inst_res = CheckErr(out_tensor_->mData, ref_output_->mData); + std::cout << (inst_res ? "SUCCESS" : "FAILURE") << std::endl; + res = res && inst_res; + out_device_buffer_->SetZero(); + } + else + { + std::cout << "Given conv problem is not supported by instance: \n\t>>>>" + << op_ptr->GetTypeString() << std::endl; + } + } + return res; + } + + template + ProfileBestConfig Profile(const std::vector& op_ptrs, + bool time_kernel = false, + bool do_verification = false, + bool do_log = false) + { + ProfileBestConfig best_config; + + for(auto& op_ptr : op_ptrs) + { + auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get()); + auto argument = op_instance_.MakeArgumentPointer( + op_ptr.get(), in_device_buffers_, out_device_buffer_); + if(op_ptr->IsSupportedArgument(argument.get())) + { + std::string op_name = op_ptr->GetTypeString(); + float avg_time = invoker->Run(argument.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flops = op_instance_.GetFlops(); + std::size_t num_btype = op_instance_.GetBtype(); + float tflops = static_cast(flops) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(avg_time < best_config.best_avg_time) + { + best_config.best_op_name = op_name; + best_config.best_tflops = tflops; + best_config.best_gb_per_sec = gb_per_sec; + best_config.best_avg_time = avg_time; + } + + if(do_verification) + { + out_device_buffer_->FromDevice(out_tensor_->mData.data()); + if(!ref_output_) + { + throw std::runtime_error( + "OpInstanceRunEngine::Profile: Reference value not availabe." 
+ " You have to provide reference function."); + } + // TODO: enable flexible use of custom check_error functions + CheckErr(out_tensor_->mData, ref_output_->mData); + + if(do_log) {} + } + out_device_buffer_->SetZero(); + } + } + return best_config; + } + + void SetAtol(double a) { atol_ = a; } + void SetRtol(double r) { rtol_ = r; } + + private: + template + void CallRefOpUnpackArgs(const F& f, std::index_sequence) const + { + f(*std::get(in_tensors_)..., *ref_output_); + } + + template + void AllocateDeviceInputTensors(std::index_sequence) + { + (AllocateDeviceInputTensorsImpl(), ...); + } + + template + void AllocateDeviceInputTensorsImpl() + { + const auto& ts = std::get(in_tensors_); + in_device_buffers_ + .emplace_back( + std::make_unique(sizeof(std::tuple_element_t) * + ts->mDesc.GetElementSpaceSize())) + ->ToDevice(ts->mData.data()); + } + + static constexpr std::size_t kNInArgs_ = std::tuple_size_v; + const OpInstanceT& op_instance_; + double rtol_{1e-5}; + double atol_{1e-8}; + + InTensorsTuple in_tensors_; + TensorPtr out_tensor_; + TensorPtr ref_output_; + + DeviceBuffers in_device_buffers_; + DeviceMemPtr out_device_buffer_; + + template + bool CheckErr(const std::vector& dev_out, const std::vector& ref_out) const + { + return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", rtol_, atol_); + } +}; + +} // namespace utils +} // namespace ck diff --git a/library/include/ck/library/utility/ranges.hpp b/library/include/ck/library/utility/ranges.hpp new file mode 100644 index 00000000..55c322f1 --- /dev/null +++ b/library/include/ck/library/utility/ranges.hpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/library/utility/iterator.hpp" + +namespace ck { +namespace ranges { + +template +using iterator_t = decltype(std::begin(std::declval())); + +template +using sentinel_t = decltype(std::end(std::declval())); + +template +using range_size_t = decltype(std::size(std::declval())); + +template +using range_difference_t = ck::iter_difference_t>; + +template +using range_value_t = iter_value_t>; + +template +using range_reference_t = iter_reference_t>; + +template +struct is_range : std::false_type +{ +}; + +template +struct is_range< + T, + std::void_t())), decltype(std::end(std::declval()))>> + : std::true_type +{ +}; + +template +inline constexpr bool is_range_v = is_range::value; + +template +struct is_sized_range : std::false_type +{ +}; + +template +struct is_sized_range()))>> + : std::bool_constant> +{ +}; +} // namespace ranges +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt new file mode 100644 index 00000000..c206c4dc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -0,0 +1,67 @@ +function(add_instance_library INSTANCE_NAME) + message("adding instance ${INSTANCE_NAME}") + add_library(${INSTANCE_NAME} OBJECT ${ARGN}) + target_compile_features(${INSTANCE_NAME} PUBLIC) + set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + clang_tidy_check(${INSTANCE_NAME}) +endfunction(add_instance_library INSTANCE_NAME) + + +file(GLOB dir_list LIST_DIRECTORIES true *) +set(CK_DEVICE_INSTANCES) +FOREACH(subdir_path ${dir_list}) +set(target_dir) +IF(IS_DIRECTORY "${subdir_path}") + get_filename_component(target_dir ${subdir_path} NAME) + add_subdirectory(${target_dir}) + 
list(APPEND CK_DEVICE_INSTANCES $) +ENDIF() +ENDFOREACH() + +add_library(device_operations STATIC ${CK_DEVICE_INSTANCES}) +add_library(composablekernels::device_operations ALIAS device_operations) + + +set(DEV_OPS_INC_DIRS + ${PROJECT_SOURCE_DIR}/include/ck/ + ${PROJECT_SOURCE_DIR}/library/include/ck/ +) + +target_compile_features(device_operations PUBLIC) +set_target_properties(device_operations PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(device_operations PUBLIC + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ +) + +#once new arches are enabled make this an option on the main cmake file +# and pass down here to be exported +target_compile_options(device_operations PRIVATE + --offload-arch=gfx908 + --offload-arch=gfx90a +) + +# install(TARGETS device_operations LIBRARY DESTINATION lib) +rocm_install(TARGETS device_operations + EXPORT device_operationsTargets) + +rocm_install(DIRECTORY ${DEV_OPS_INC_DIRS} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck) +rocm_install(EXPORT device_operationsTargets + FILE composable_kerneldevice_operationsTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt new file mode 100644 index 00000000..0f2a7391 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt @@ -0,0 +1,18 @@ +add_instance_library(device_batched_gemm_instance + device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp + device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp + device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp + device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp + device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp + device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp + device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp + device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp + device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp + device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp + device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp + device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp new file mode 100644 index 00000000..cc878745 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
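+//
+// Naming note: the gkm/gkn/gmn suffix spells out the physical dimension order of each batched
+// tensor (batch dimension G outermost). "gkm" for A means the logical M x K matrix is stored
+// K-major, hence the ColumnMajor ALayout used in the instance list below.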
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances = std::tuple< + // clang-format off + //##################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp new file mode 100644 index 00000000..04200cfb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 
8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp new file mode 100644 index 00000000..7b86f3cc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
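// Illustration (editorial sketch, not part of the patch): every *_instance.cpp in this
// directory follows the same registration pattern -- a std::tuple enumerates concrete
// DeviceBatchedGemmXdl specializations, and add_device_operation_instances appends one
// object per tuple element to the caller-supplied vector of operation pointers. A
// minimal sketch of that pattern, assuming a hypothetical base class BaseOp (the real
// helper is the one included from add_device_operation_instance.hpp):

#include <memory>
#include <tuple>
#include <vector>

template <typename BaseOp, typename... Instances>
void append_all_instances(std::vector<std::unique_ptr<BaseOp>>& out,
                          std::tuple<Instances...> /*type list only*/)
{
    // C++17 fold expression: default-construct one object of each kernel type in the tuple.
    (out.push_back(std::make_unique<Instances>()), ...);
}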
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances = std::tuple< + // clang-format off + //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp new file mode 100644 index 00000000..2afb1afb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
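// Illustration (editorial sketch, not part of the patch): the "gmk_gnk_gmn" suffix of the
// file below encodes the per-batch memory order -- A stored as [G, M, K] (row-major
// m x k), B as [G, N, K] (so the GEMM reads b[n, k], i.e. B is column-major), and C as
// [G, M, N] (row-major). For densely packed tensors the leading dimensions and batch
// strides a host would pass are plain products; the sizes here are hypothetical:

constexpr long long G = 16, M = 1024, N = 768, K = 512;
constexpr long long StrideA      = K;     // row-major a[m, k]: consecutive rows are K elements apart
constexpr long long StrideB      = K;     // b stored as [n, k]: consecutive n-rows are K elements apart
constexpr long long StrideC      = N;     // row-major c[m, n]: consecutive rows are N elements apart
constexpr long long BatchStrideA = M * K; // one dense A matrix per batch
constexpr long long BatchStrideB = N * K; // one dense B matrix per batch
constexpr long long BatchStrideC = M * N; // one dense C matrix per batch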
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances = std::tuple< + // clang-format off + //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 
2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceBatchedGemmXdl< BF16, BF16, BF16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp new file mode 100644 index 00000000..68d76894 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
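// Illustration (editorial sketch, not part of the patch): in the instance lists below the
// integer template arguments after the three elementwise operations are the tile
// configuration named in the column headers -- BlockSize, MPerBlock, NPerBlock,
// K0PerBlock, K1, MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave -- followed by the A/B block
// transfer descriptors. Roughly, each work-group produces one MPerBlock x NPerBlock tile
// of one batch, so the launch size scales as in this sketch (padding and split-K ignored):

#include <cstdint>

inline std::int64_t approx_grid_size(std::int64_t G, std::int64_t M, std::int64_t N,
                                     std::int64_t MPerBlock, std::int64_t NPerBlock)
{
    const std::int64_t tiles_m = (M + MPerBlock - 1) / MPerBlock; // ceil-divide M into tile rows
    const std::int64_t tiles_n = (N + NPerBlock - 1) / NPerBlock; // ceil-divide N into tile columns
    return G * tiles_m * tiles_n;                                 // one work-group per tile per batch
}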
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK| LoopScheduler| Pipeline| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| | | + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| Stage | | | + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 
2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + 
DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp new file mode 100644 index 00000000..737e5bfc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
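// Illustration (editorial sketch, not part of the patch): the blocks guarded by
// CK_EXPERIMENTAL_INTER_WAVE_INSTANCES and CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES above
// and below extend the same std::tuple with LoopScheduler::Interwave and
// PipelineVersion::v2 variants, so those extra kernels only exist when the macros are
// enabled at build time. The pattern in miniature (placeholder names, not CK's):

#include <tuple>

#define ENABLE_EXTRA_VARIANTS 1

struct KernelBaseline {};
struct KernelExperimental {};

using instance_list = std::tuple<
    KernelBaseline
#if ENABLE_EXTRA_VARIANTS
    ,
    KernelExperimental // joins the same list; callers see it through the same vector
#endif
    >;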
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK| LoopScheduler| Pipeline| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| | | + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| Stage | | | + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 
2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + 
DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp new file mode 100644 index 00000000..e09d0173 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
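// Illustration (editorial sketch, not part of the patch): callers pick these kernels up
// through the add_device_batched_gemm_xdl_*_instances free function defined at the end
// of each file. The vector's element type is a pointer to the abstract DeviceBatchedGemm
// interface; its template arguments (layouts, data types, elementwise ops, in the order
// assumed here) were stripped from this copy of the patch, so the snippet below is a
// hedged reconstruction rather than a verbatim signature:
//
//   std::vector<std::unique_ptr<DeviceBatchedGemm<Row, Row, Row, F16, F16, F16,
//                                                 PassThrough, PassThrough, PassThrough>>> ops;
//   add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances(ops);
//   for (const auto& op : ops)
//   {
//       // Build an argument for the (G, M, N, K) problem, skip instances whose
//       // IsSupportedArgument() check fails, time the rest, and keep the fastest.
//   }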
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple< + // clang-format off + //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK| LoopScheduler| Pipeline| + //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| | | + //#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| Stage | | | + //#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 
128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, 
PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 256, 4, 8, 16, 
16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, 
PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp new file mode 100644 index 00000000..984d66e2 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple< + // clang-format off + //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK| LoopScheduler| Pipeline| + //#################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| Prefetch| | | + //#################| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| Stage | | | + //#################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, 
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 
128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceBatchedGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp new file mode 100644 index 00000000..12cada9c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp new file mode 100644 index 00000000..13f19886 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp new file mode 100644 index 00000000..2ca1adc2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
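+// Layout naming note (a sketch of the convention, inferred from the template
+// arguments below): "gmk_gkn_gmn" encodes batched tensors A[G, M, K], B[G, K, N]
+// and C[G, M, N], all row-major, with G the batch dimension; this matches the
+// Row, Row, Row layout arguments passed to every instance in the tuple.
+// Each DeviceBatchedGemmXdl entry is one tuning variant, differing only in block
+// size, M/N/K0 tile shape, XDL wave mapping and transfer vector widths
+// (K1 = 4 here, i.e. four fp32 values per vectorized K-load).
+// The add_*_instances() function at the end hands the whole tuple to
+// add_device_operation_instances(), so callers can enumerate the registered
+// variants and select a supported one at run time.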
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 
1>, 1, 1, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp new file mode 100644 index 00000000..fe5de527 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + 
DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceBatchedGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp new file mode 100644 index 00000000..5b55c8e1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using AData = int8_t; +using BData = int8_t; +using CData = int8_t; +using AccData = int32_t; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, 
Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 1, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, 
BData, CData, AccData, Col, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 2, 16, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp new file mode 100644 index 00000000..9517e457 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using AData = int8_t; +using BData = int8_t; +using CData = int8_t; +using AccData = int32_t; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 
2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 
16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Col, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp new file mode 100644 index 00000000..43b91244 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using AData = int8_t; +using BData = int8_t; +using CData = int8_t; +using AccData = int32_t; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, 
PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 16, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, 
BData, CData, AccData, Row, Row, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 16, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp new file mode 100644 index 00000000..326500fc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using AData = int8_t; +using BData = int8_t; +using CData = int8_t; +using AccData = int32_t; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances = std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceBatchedGemmXdl< AData, BData, CData, AccData, Row, Col, Row, PassThrough, PassThrough, PassThrough, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1> + // clang-format on + >; + +void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt new file mode 100644 index 00000000..d0e9b265 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt @@ -0,0 +1,4 @@ +add_instance_library(device_batched_gemm_add_relu_gemm_add_instance + device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp + device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp new file mode 100644 index 00000000..e1bfa88f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu; +using CDE1ElementOp = ck::tensor_operation::element_wise::Add; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances = + std::tuple< + // clang-format off + //##################################################| A0Layout| B0Layout| D0Layout| B1Layout| D1sLayout| E1Layout| A0Data| B0Data| Acc0DataType| D0DataType| B1Data| Acc1CData| CShuffle| D1sData| E1Data| A0| B0| CDE0| B1| CDE1| PadGemm0M| PadGemm0N| PadGemm0K| PadGemm1N| PadGemm1K|NumGemm0K| Block| Gemm0| Gemm0| Gemm0| Gemm1| Gemm1|A0K1|B0K1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| C1Shuffle| C1Shuffle| CDE1BlockTransferClusterLengths| CDE1BlockTransfer| + //##################################################| | | | | | | Type| Type| Type| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| | | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + 
//##################################################| | | | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per|Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | + // no padding + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 32, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 32, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 32, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 64, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 64, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8>, + // Padded fallback kernel + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, true, true, true, true, true, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Row, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, true, true, true, true, true, 1, 256, 128, 64, 32, 128, 32, 8, 8, 2, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector, + Row, + ck::Tuple, + Row, + F16, + F16, + ck::Tuple, + F16, + ck::Tuple, + F16, + PassThrough, + PassThrough, + CDE0ElementOp, + PassThrough, + CDE1ElementOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp new file mode 100644 index 00000000..f59b7425 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
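// Editorial note (illustrative sketch, not the CK implementation): the two fused
// element-wise stages wired into the batched_gemm_add_relu_gemm_add instances are
// CDE0ElementOp = AddRelu, applied to the first GEMM's accumulator before it feeds the
// second GEMM, and CDE1ElementOp = Add, applied to the second GEMM's output. Per element
// they amount to e0 = max(acc0 + d0, 0) and e1 = acc1 + d1. The functors below are
// stand-ins written for this note only; the real operators are defined in CK's
// element-wise operation headers.
struct AddReluSketch
{
    template <typename E, typename C, typename D>
    void operator()(E& e, const C& c, const D& d) const
    {
        const float t = static_cast<float>(c) + static_cast<float>(d);
        e = static_cast<E>(t > 0.f ? t : 0.f); // bias/residual add followed by ReLU
    }
};
struct AddSketch
{
    template <typename E, typename C, typename D>
    void operator()(E& e, const C& c, const D& d) const
    {
        e = static_cast<E>(static_cast<float>(c) + static_cast<float>(d)); // plain residual add
    }
};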
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu; +using CDE1ElementOp = ck::tensor_operation::element_wise::Add; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances = + std::tuple< + // clang-format off + //##################################################| A0Layout| B0Layout| D0Layout| B1Layout| D1sLayout| E1Layout| A0Data| B0Data| Acc0DataType| D0DataType| B1Data| Acc1CData| CShuffle| D1sData| E1Data| A0| B0| CDE0| B1| CDE1| PadGemm0M| PadGemm0N| PadGemm0K| PadGemm1N| PadGemm1K| NumGemm0K| Block| Gemm0| Gemm0| Gemm0| Gemm1| Gemm1| A0K1| B0K1|B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockTransfer|A0BlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| C1Shuffle| C1Shuffle| CDE1BlockTransferClusterLengths| CDE1BlockTransfer| + //##################################################| | | | | | | Type| Type| Type| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| | | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##################################################| | | | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | + // no padding + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 256, 128, 32, 128, 32, 8, 8, 4, 32, 32, 2, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 64, 64, 32, 8, 8, 4, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 32, 64, 32, 8, 8, 4, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 64, 128, 32, 8, 8, 4, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 128, 128, 32, 128, 32, 8, 8, 4, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 32, 128, 32, 8, 8, 4, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 32, 64, 32, 8, 8, 4, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 4, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 64, 128, 32, 8, 8, 4, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 8, S<1, 16, 1,16>, 
8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, false, false, false, false, false, 1, 256, 64, 256, 64, 64, 32, 8, 8, 4, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 4, S<1, 32, 1, 8>, 8>, + // Padded fallback kernel + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, true, true, true, true, true, 1, 256, 128, 128, 64, 128, 32, 8, 8, 4, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle< Row, Col, ck::Tuple, Col, ck::Tuple, Row, F16, F16, F32, ck::Tuple, F16, F32, F32, ck::Tuple, F16, PassThrough, PassThrough, CDE0ElementOp, PassThrough, CDE1ElementOp, true, true, true, true, true, 1, 256, 128, 64, 32, 128, 32, 8, 8, 4, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance( + std::vector, + Col, + ck::Tuple, + Row, + F16, + F16, + ck::Tuple, + F16, + ck::Tuple, + F16, + PassThrough, + PassThrough, + CDE0ElementOp, + PassThrough, + CDE1ElementOp>>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt new file mode 100644 index 00000000..865a31e7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +add_instance_library(device_batched_gemm_gemm_instance + device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp + device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp new file mode 100644 index 00000000..9b96194c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
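// Editorial note (minimal sketch, assuming C++17; not the CK implementation): every
// add_*_instances function in these files forwards a default-constructed std::tuple of
// concrete device-op types to add_device_operation_instances, which appends one
// type-erased object per tuple element to the caller's vector. A self-contained
// stand-in, with the _sketch suffix marking it as hypothetical, could look like this:
#include <memory>
#include <tuple>
#include <vector>

template <typename BaseOp, typename... ConcreteOps>
void add_device_operation_instances_sketch(std::vector<std::unique_ptr<BaseOp>>& instances,
                                           const std::tuple<ConcreteOps...>&)
{
    // Each ConcreteOps is assumed to derive from BaseOp, so the unique_ptr converts
    // implicitly; only the types matter, the tuple argument itself carries no state.
    (instances.push_back(std::make_unique<ConcreteOps>()), ...);
}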
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmPadded = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances = std::tuple< + // clang-format off + //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 64, 32, 8, 8, 2, 32, 32, 2, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 128, 32, 8, 8, 2, 32, 32, 2, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 64, 32, 8, 8, 2, 32, 32, 1, 8, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 128, 32, 8, 8, 2, 32, 32, 1, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 32, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 32, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 
8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 64, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 64, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8>, + // Padded fallback kernel + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 64, 32, 128, 32, 8, 8, 2, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp new file mode 100644 index 00000000..0713dfcd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
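// Editorial note (illustrative only): the DeviceBatchedGemmGemm_Xdl_CShuffle lists in
// this directory pair many GemmDefault instances with a single GemmPadded (MNKOPadding)
// "padded fallback kernel". The default instances assume M, N, K and the second GEMM's
// N dimension (O) are already multiples of their tile sizes; the padded fallback rounds
// each extent up to the next tile boundary so arbitrary problem sizes still have at
// least one usable instance, at some cost in efficiency. The helper below sketches that
// rounding and is not part of CK.
#include <cstddef>

inline std::size_t pad_to_multiple(std::size_t extent, std::size_t tile)
{
    return ((extent + tile - 1) / tile) * tile; // e.g. pad_to_multiple(1000, 128) == 1024
}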
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmPadded = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances = std::tuple< + // clang-format off + //################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //################################| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //################################| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | + //################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 64, 32, 8, 8, 4, 32, 32, 2, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 128, 32, 8, 8, 4, 32, 32, 2, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 64, 32, 8, 8, 4, 32, 32, 1, 8, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 128, 32, 8, 8, 4, 32, 32, 1, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 64, 32, 8, 8, 4, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 64, 32, 8, 8, 4, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 128, 32, 8, 8, 4, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 128, 32, 8, 8, 4, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 32, 128, 32, 8, 8, 4, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 32, 64, 32, 8, 8, 4, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 4, S<1, 32, 1, 8>, 8>, + 
DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 64, 128, 32, 8, 8, 4, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 8, S<1, 16, 1,16>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 64, 64, 32, 8, 8, 4, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 4, S<1, 32, 1, 8>, 8>, + // Padded fallback kernel + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 128, 64, 128, 32, 8, 8, 4, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, 1, 2, S<1, 32, 1, 8>, 8>, + DeviceBatchedGemmGemm_Xdl_CShuffle< Row, Col, Col, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 64, 32, 128, 32, 8, 8, 4, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 1, 2, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt new file mode 100644 index 00000000..db3719cf --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt @@ -0,0 +1,7 @@ +add_instance_library(device_batched_gemm_reduce_instance + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp new file mode 100644 index 00000000..521c3d92 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
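// Editorial note (illustrative only): the four batched_gemm_reduce source files listed
// in the CMakeLists above differ only in their layout suffix, which encodes how each
// batched operand is stored: for a batch index g, "xyz" means x varies slowest and z
// varies fastest. The offset helpers below are written for this note to spell out the
// convention as we read it from the template arguments; they are not CK code.
#include <cstddef>

// A stored as a[g][m][k] ("gmk", row-major M x K per batch) ...
inline std::size_t offset_gmk(std::size_t g, std::size_t m, std::size_t k, std::size_t M, std::size_t K)
{
    return (g * M + m) * K + k;
}
// ... or as a[g][k][m] ("gkm", column-major M x K per batch).
inline std::size_t offset_gkm(std::size_t g, std::size_t m, std::size_t k, std::size_t M, std::size_t K)
{
    return (g * K + k) * M + m;
}
// B stored as b[g][k][n] ("gkn", row-major K x N) ...
inline std::size_t offset_gkn(std::size_t g, std::size_t k, std::size_t n, std::size_t K, std::size_t N)
{
    return (g * K + k) * N + n;
}
// ... or as b[g][n][k] ("gnk", column-major K x N).
inline std::size_t offset_gnk(std::size_t g, std::size_t k, std::size_t n, std::size_t K, std::size_t N)
{
    return (g * N + n) * K + k;
}
// C is written as c[g][m][n] ("gmn") by all of these instances.
inline std::size_t offset_gmn(std::size_t g, std::size_t m, std::size_t n, std::size_t M, std::size_t N)
{
    return (g * M + m) * N + n;
}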
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, 
ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 4, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 
1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp new file mode 100644 index 00000000..231d612d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
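// Editorial note (host-side reference sketch, not CK code): the gemm+reduce instances
// in this directory compute the batched GEMM result and, per (g, m) row, two reductions
// of that result: Add over PassThrough(c) and Add over UnarySquare(c), i.e. the row sum
// and the row sum-of-squares, which a caller can turn into mean and mean-square. The
// sketch below uses the gkm_gkn_gmn layout variant and assumes the two reduction outputs
// are laid out as d[g][m]; buffer sizing is the caller's responsibility.
#include <cstddef>
#include <vector>

void batched_gemm_reduce_reference(const std::vector<float>& a, // a[g][k][m]  ("gkm")
                                   const std::vector<float>& b, // b[g][k][n]  ("gkn")
                                   std::vector<float>& c,       // c[g][m][n]  ("gmn")
                                   std::vector<float>& d0,      // d0[g][m] = sum_n c
                                   std::vector<float>& d1,      // d1[g][m] = sum_n c*c
                                   std::size_t G, std::size_t M, std::size_t N, std::size_t K)
{
    for(std::size_t g = 0; g < G; ++g)
        for(std::size_t m = 0; m < M; ++m)
        {
            float sum = 0.f, sum_sq = 0.f;
            for(std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                    acc += a[(g * K + k) * M + m] * b[(g * K + k) * N + n];
                c[(g * M + m) * N + n] = acc;
                sum += acc;          // reduction 0: PassThrough then Add
                sum_sq += acc * acc; // reduction 1: UnarySquare then Add
            }
            d0[g * M + m] = sum;
            d1[g * M + m] = sum_sq;
        }
}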
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, 
ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, 
S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp new file mode 100644 index 00000000..165bc395 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
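The typedefs in these files pair a plain ReduceSum (identity input op) with a ReduceSum over squared values (UnarySquare input op), applied to the GEMM output described by the per-file comment c[g, m, n] = a[g, m, k] * b[g, n, k]. A scalar reference of that fused operation is sketched below, assuming the reductions run over the N axis (consistent with the mean / mean-square use case); it mirrors the intended math, not the kernel's tiled numerics.

// Reference semantics only (assumes the two reductions run over the N axis of c,
// matching the sum / sum-of-squares element ops declared in these files).
#include <cstddef>
#include <vector>

// a: G x M x K, b: G x N x K, c: G x M x N, d0/d1: G x M
void batched_gemm_reduce_ref(const std::vector<float>& a, const std::vector<float>& b,
                             std::vector<float>& c, std::vector<float>& d0,
                             std::vector<float>& d1, std::size_t G, std::size_t M,
                             std::size_t N, std::size_t K)
{
    for(std::size_t g = 0; g < G; ++g)
        for(std::size_t m = 0; m < M; ++m)
        {
            float sum = 0.f, sum_sq = 0.f;
            for(std::size_t n = 0; n < N; ++n)
            {
                float acc = 0.f;
                for(std::size_t k = 0; k < K; ++k)
                    acc += a[(g * M + m) * K + k] * b[(g * N + n) * K + k];
                c[(g * M + m) * N + n] = acc;
                sum += acc;          // ReduceSum with identity input op
                sum_sq += acc * acc; // ReduceSum with UnarySquare input op
            }
            d0[g * M + m] = sum;    // mean follows as sum / N
            d1[g * M + m] = sum_sq; // mean-square follows as sum_sq / N
        }
}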
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, 
ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, 
S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp new file mode 100644 index 00000000..832fc3b0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
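Every add_device_*_instances() function in these files follows the same pattern: a std::tuple of concrete instantiations is declared per layout, and add_device_operation_instances default-constructs each tuple element and appends it to the caller's vector of device-op pointers (the std::unique_ptr element type of that vector was stripped from this patch during extraction). Below is a hedged sketch of that mechanism with deliberately hypothetical type names; it is not the CK implementation.

// Hypothetical illustration of the registration pattern used above;
// BaseOp / OpA / OpB stand in for the CK device-op types.
#include <memory>
#include <tuple>
#include <vector>

struct BaseOp { virtual ~BaseOp() = default; };
struct OpA : BaseOp {};
struct OpB : BaseOp {};

// Counterpart of add_device_operation_instances: default-construct every
// element of the tuple and hand ownership to the instance vector.
template <typename... Ops>
void append_instances(std::vector<std::unique_ptr<BaseOp>>& out, std::tuple<Ops...>)
{
    (out.push_back(std::make_unique<Ops>()), ...);
}

// Mirrors the per-layout add_device_*_instances() functions above.
void add_my_instances(std::vector<std::unique_ptr<BaseOp>>& out)
{
    append_instances(out, std::tuple<OpA, OpB>{});
}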
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +using device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, 
ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 
4>, 8, S<64, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1>, + DeviceBatchedGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1> + // clang-format on + >; + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt new file mode 100644 index 00000000..29fce566 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt @@ -0,0 +1,4 @@ +add_instance_library(device_batched_gemm_softmax_gemm_instance + device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp new file mode 100644 index 00000000..99e87124 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp @@ -0,0 +1,131 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
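The batched_gemm_softmax_gemm instance file that begins here registers fused attention-style kernels: a first GEMM whose accumulator passes through the Scale element op, a row-wise softmax, then a second GEMM, with a Masking template flag selecting the MaskOutUpperTriangle variants. A compact scalar reference of that composition is sketched below as a description of the intended math, not of the kernel's tiled, fused computation; sizes and names are illustrative.

// Scalar reference for one batch of gemm -> scale -> (optional causal mask)
// -> softmax -> gemm, matching the gmk_gnk_gno_gmo layouts of this file.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

// q: M x K, k: N x K, v: N x O, out: M x O
void gemm_softmax_gemm_ref(const std::vector<float>& q, const std::vector<float>& k,
                           const std::vector<float>& v, std::vector<float>& out,
                           std::size_t M, std::size_t N, std::size_t K, std::size_t O,
                           float scale, bool mask_upper_triangle)
{
    std::vector<float> s(N);
    for(std::size_t m = 0; m < M; ++m)
    {
        float row_max = -std::numeric_limits<float>::infinity();
        for(std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(std::size_t kk = 0; kk < K; ++kk)
                acc += q[m * K + kk] * k[n * K + kk]; // first GEMM, B0 indexed [n, k]
            acc *= scale;                             // Acc0 elementwise Scale op
            if(mask_upper_triangle && n > m)          // MaskOutUpperTriangle variant
                acc = -std::numeric_limits<float>::infinity();
            s[n]    = acc;
            row_max = std::max(row_max, acc);
        }
        float denom = 0.f;
        for(std::size_t n = 0; n < N; ++n)
        {
            s[n] = std::exp(s[n] - row_max); // numerically stable softmax
            denom += s[n];
        }
        for(std::size_t o = 0; o < O; ++o)
        {
            float acc = 0.f;
            for(std::size_t n = 0; n < N; ++n)
                acc += (s[n] / denom) * v[n * O + o]; // softmax then second GEMM
            out[m * O + o] = acc;
        }
    }
}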
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmPadded = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +template +using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances = + std::tuple< + // clang-format off + //#######################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| MaskOut| + //#######################################| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Upper| + //#######################################| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Triangle| + //#######################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 64, 32, 8, 8, 2, 32, 32, 2, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, 
PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 128, 32, 8, 8, 2, 32, 32, 2, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 64, 32, 8, 8, 2, 32, 32, 1, 8, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 128, 32, 8, 8, 2, 32, 32, 1, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 32, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 32, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 
true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 64, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 256, 64, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8, Masking>, + // Padded fallback kernel + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 64, 32, 128, 32, 8, 8, 2, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking> + // clang-format on + >; + +template +using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_irregular_k_instances = + std::tuple< + // clang-format off + //#######################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| MaskOut| + //#######################################| | | | | Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| Upper| + //#######################################| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| Triangle| + //#######################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 256, 128, 40, 64, 32, 4, 4, 2, 32, 32, 2, 4, 2, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 256, 128, 40, 128, 32, 4, 4, 2, 32, 32, 2, 4, 4, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 256, 40, 64, 32, 4, 4, 2, 32, 32, 1, 8, 2, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 256, 40, 128, 32, 4, 4, 2, 32, 32, 1, 8, 4, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 128, 40, 64, 32, 4, 4, 2, 32, 32, 1, 4, 2, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking>, + DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< Row, Col, Row, Row, F16, F16, F16, F16, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, 1, 256, 128, 128, 40, 128, 32, 4, 4, 2, 32, 32, 1, 4, 4, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S<2,128, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, Masking> + // clang-format on + >; + +void add_device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances< + false>{}); + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_irregular_k_instances< + false>{}); +} + +void add_device_batched_gemm_masking_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + 
device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances< + true>{}); + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_irregular_k_instances< + true>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt new file mode 100644 index 00000000..76121ffc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt @@ -0,0 +1,5 @@ +add_instance_library(device_batched_gemm_softmax_gemm_permute_instance + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp new file mode 100644 index 00000000..53ad7ba5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmPadded = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +static constexpr auto TensorDefault = ck::tensor_operation::device::TensorSpecialization::Default; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +template +using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances = + std::tuple< + // clang-format off + // #############################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData| Acc0BiasData| Acc1BiasData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| 
B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| MaskingSpec| + // #############################################| | | | | | Type| Type| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | + // #############################################| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | + // #############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 256, 128, 32, 64, 32, 8, 8, 2, 32, 32, 2, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 256, 128, 32, 128, 32, 8, 8, 2, 32, 32, 2, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 256, 32, 64, 32, 8, 8, 2, 32, 32, 1, 8, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 256, 32, 128, 32, 8, 8, 2, 32, 32, 1, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 32, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 32, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 32, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, 
ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 64, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 64, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8, MaskingSpec>, + // Padded fallback kernel + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, BF16, BF16, BF16, BF16, ck::Tuple<>, ck::Tuple<>, F32, BF16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 64, 32, 128, 32, 8, 8, 2, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec> + // clang-format on + >; + +void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances( + std::vector, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>>& + instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances< + 2, + 1, + 1, + 1, + 1, + MaskingSpecialization::MaskOutUpperTriangle>{}); +} + +void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances( + std::vector< + std::unique_ptr, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskDisabled>>>& + instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances< + 2, + 1, + 1, + 1, + 1, + MaskingSpecialization::MaskDisabled>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp 
b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp new file mode 100644 index 00000000..21da6895 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmPadded = ck::tensor_operation::device::GemmSpecialization::MNKOPadding; + +static constexpr auto TensorDefault = ck::tensor_operation::device::TensorSpecialization::Default; + +// c[g, m, n] = a[g, m, k] * b[g, n, k] +template +using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances = + std::tuple< + // clang-format off + // #############################################| NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData| Acc0BiasData| Acc1BiasData| AccData| CShuffle| A| B0| Acc0| B1| C| GEMM| ATensorSpec| B0TensorSpec| B1TensorSpec| CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| MaskingSpec| + // #############################################| | | | | | Type| Type| Type| Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| | | | | Prefetch| Size| MPer| NPer| KPer| NPer| KPer| | | | XDL| XDL| MXdl| NXdl| NXdl| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | + // #############################################| | | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | | | | Stage| | Block| Block| Block| Block| Block| | | | | | Per| Per| Per| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| 
ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | + // #############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | Wave| Wave| Wave| | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 256, 128, 32, 64, 32, 8, 8, 2, 32, 32, 2, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 256, 128, 32, 128, 32, 8, 8, 2, 32, 32, 2, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 256, 32, 64, 32, 8, 8, 2, 32, 32, 1, 8, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 256, 32, 128, 32, 8, 8, 2, 32, 32, 1, 8, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 64, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 32, 64, 32, 8, 8, 2, 32, 32, 1, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + 
DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 128, 32, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 32, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 32, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 64, 128, 32, 8, 8, 2, 16, 16, 1, 16, 8, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 8, S<1, 16, 1,16>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 64, 256, 64, 64, 32, 8, 8, 2, 16, 16, 1, 16, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 4, S<1, 32, 1, 8>, 8, MaskingSpec>, + // Padded fallback kernel + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 
128, 128, 64, 128, 32, 8, 8, 2, 32, 32, 1, 4, 4, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, false, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec>, + DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, F16, F16, F16, F16, ck::Tuple<>, ck::Tuple<>, F32, F16, PassThrough, PassThrough, Scale, PassThrough, PassThrough, GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault, 1, 256, 128, 64, 32, 128, 32, 8, 8, 2, 32, 32, 1, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S< 8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 2, S<1, 32, 1, 8>, 8, MaskingSpec> + // clang-format on + >; + +void add_device_batched_gemm_masking_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances( + std::vector, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskOutUpperTriangle>>>& + instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances< + 2, + 1, + 1, + 1, + 1, + MaskingSpecialization::MaskOutUpperTriangle>{}); +} + +void add_device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances( + std::vector< + std::unique_ptr, + ck::Tuple<>, + PassThrough, + PassThrough, + Scale, + PassThrough, + PassThrough, + MaskingSpecialization::MaskDisabled>>>& + instances) +{ + add_device_operation_instances( + instances, + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances< + 2, + 1, + 1, + 1, + 1, + MaskingSpecialization::MaskDisabled>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt new file mode 100644 index 00000000..d12a2f24 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/CMakeLists.txt @@ -0,0 +1,10 @@ +add_instance_library(device_batchnorm_instance + device_batchnorm_forward_f16_instance.cpp + device_batchnorm_forward_f32_instance.cpp + device_batchnorm_forward_bf16_instance.cpp + device_batchnorm_forward_f64_instance.cpp + device_batchnorm_backward_f16_instance.cpp + device_batchnorm_backward_f32_instance.cpp + device_batchnorm_backward_bf16_instance.cpp + device_batchnorm_backward_f64_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp new file mode 100644 index 00000000..b62c8b99 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_bf16_instance.cpp @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
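Before the batchnorm instances, it is worth stating what the BF16 and F16 fused-attention instance files above all target. The following is a minimal host-side sketch of the computation suggested by the gmk_gnk_gno_gmo naming and the Scale / MaskOutUpperTriangle operators (my reading of those names, shown for one batch; it is not code taken from the kernels): S = scale * A * B0^T with an optional causal mask, a row-wise softmax over S, then C = softmax(S) * B1.

// Single-batch reference for fused GEMM + masked softmax + GEMM.
// Assumed shapes: A[m,k], B0[n,k], B1[n,o], C[m,o]; MaskOutUpperTriangle is
// treated as a causal mask on n > m. Illustrative only.
#include <algorithm>
#include <cmath>
#include <vector>

void reference_attention(const std::vector<float>& a,   // M x K
                         const std::vector<float>& b0,  // N x K
                         const std::vector<float>& b1,  // N x O
                         std::vector<float>& c,         // M x O
                         int M, int N, int K, int O,
                         float scale, bool mask_upper_triangle)
{
    std::vector<float> s(N);
    for(int m = 0; m < M; ++m)
    {
        // Gemm0: s[n] = scale * sum_k a[m,k] * b0[n,k], masked where n > m.
        float row_max = -INFINITY;
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(int k = 0; k < K; ++k)
                acc += a[m * K + k] * b0[n * K + k];
            s[n]    = (mask_upper_triangle && n > m) ? -INFINITY : scale * acc;
            row_max = std::max(row_max, s[n]);
        }
        // Numerically stable row-wise softmax.
        float sum = 0.f;
        for(int n = 0; n < N; ++n)
        {
            s[n] = std::exp(s[n] - row_max);
            sum += s[n];
        }
        // Gemm1: c[m,o] = sum_n softmax(s)[n] * b1[n,o].
        for(int o = 0; o < O; ++o)
        {
            float acc = 0.f;
            for(int n = 0; n < N; ++n)
                acc += (s[n] / sum) * b1[n * O + o];
            c[m * O + o] = acc;
        }
    }
}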
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_backward_bf16_blockwise_instances = + std::tuple < + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, DscaleDbiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcVectorSize, DscaleDbiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_backward_bf16_multiblock_instances = + std::tuple < + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcDstVectorSize, BiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + 
DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +void add_device_batchnorm_backward_rank_4_3_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_batchnorm_backward_bf16_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_backward_bf16_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp new file mode 100644 index 00000000..d05b8b59 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f16_instance.cpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_backward_f16_blockwise_instances = + std::tuple < + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, DscaleDbiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcVectorSize, DscaleDbiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + 
DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_backward_f16_multiblock_instances = + std::tuple < + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcDstVectorSize, BiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +void add_device_batchnorm_backward_rank_4_3_f16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_backward_f16_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_backward_f16_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp new file mode 100644 index 00000000..e3ef95d1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f32_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
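The rank-4 / reduce-3 backward instances in these files (bf16 and f16 above, f32 and f64 below) correspond to NHWC batchnorm, with mean and variance taken over N, H and W for each channel. As orientation, a commonly used per-channel formulation of the gradients is sketched here; it is my assumption of the intended math, and details such as saved versus recomputed statistics are outside the sketch.

// Per-channel batchnorm backward reference (training mode); R = N*H*W values
// of one channel. Assumes the channel's mean and 1/sqrt(var+eps) are given.
#include <cstddef>
#include <vector>

struct BnBwdChannelResult
{
    std::vector<float> dx; // gradient w.r.t. x, same length as x
    float dscale;          // d(loss)/d(gamma)
    float dbias;           // d(loss)/d(beta)
};

BnBwdChannelResult batchnorm_bwd_channel(const std::vector<float>& x,
                                         const std::vector<float>& dy,
                                         float mean,
                                         float inv_std, // 1 / sqrt(var + eps)
                                         float scale)   // gamma
{
    const std::size_t R = x.size();
    const float Rf      = static_cast<float>(R);
    BnBwdChannelResult r{std::vector<float>(R), 0.f, 0.f};

    // dscale = sum(dy * xhat), dbias = sum(dy)
    for(std::size_t i = 0; i < R; ++i)
    {
        const float xhat = (x[i] - mean) * inv_std;
        r.dscale += dy[i] * xhat;
        r.dbias += dy[i];
    }

    // dx = scale * inv_std * (dy - dbias/R - xhat * dscale/R)
    for(std::size_t i = 0; i < R; ++i)
    {
        const float xhat = (x[i] - mean) * inv_std;
        r.dx[i] = scale * inv_std * (dy[i] - r.dbias / Rf - xhat * r.dscale / Rf);
    }
    return r;
}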
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_backward_f32_blockwise_instances = std::tuple< + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, DscaleDbiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcVectorSize, DscaleDbiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_backward_f32_multiblock_instances = + std::tuple < + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcDstVectorSize, BiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + 
DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +void add_device_batchnorm_backward_rank_4_3_f32_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_backward_f32_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_backward_f32_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp new file mode 100644 index 00000000..41be396c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_backward_f64_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_backward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_backward_f64_blockwise_instances = std::tuple< + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, DscaleDbiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcVectorSize, DscaleDbiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + 
DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_backward_f64_multiblock_instances = + std::tuple < + // XDataType, DxDataType, DyDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, DyElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XDyDxVectorDim, XSrcVectorSize, DySrcVectorSize, DxDstVectorSize, ScaleSrcDstVectorSize, BiasDstVectorSize, MeanVarSrcVectorSize + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl, + DeviceBatchNormBwdImpl + >; +// clang-format on + +void add_device_batchnorm_backward_rank_4_3_f64_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_backward_f64_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_backward_f64_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp new file mode 100644 index 00000000..cd1e05b1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_bf16_instance.cpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
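Every instance translation unit in this group, backward and forward alike, follows the same registration shape: a std::tuple of fully specialized device-op types plus an add_device_*_instances function that hands that tuple to add_device_operation_instances, which appends one object per tuple element to a vector of base-class pointers. The snippet below is a stripped-down illustration of that expansion with toy types; it is my simplified restatement of the pattern, not the library helper itself.

// Simplified sketch of the tuple-to-vector registration pattern.
#include <memory>
#include <tuple>
#include <vector>

struct BaseOp
{
    virtual ~BaseOp() = default;
};

template <int BlockSize>
struct ConcreteOp : BaseOp
{
};

// Append a default-constructed object of every type in the tuple to `ops`,
// stored through the abstract base-class pointer.
template <typename Base, typename... Instances>
void append_instances(std::vector<std::unique_ptr<Base>>& ops, std::tuple<Instances...>)
{
    (ops.push_back(std::make_unique<Instances>()), ...);
}

int main()
{
    std::vector<std::unique_ptr<BaseOp>> ops;
    append_instances(ops, std::tuple<ConcreteOp<64>, ConcreteOp<128>, ConcreteOp<256>>{});
    return ops.size() == 3 ? 0 : 1; // three tuning variants registered
}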
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_bf16_blockwise_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_forward_bf16_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + 
DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_bf16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_bf16_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_bf16_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp new file mode 100644 index 00000000..073dd583 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f16_instance.cpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_f16_blockwise_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + 
DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_forward_f16_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_f16_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_f16_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_f16_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp new file mode 100644 index 00000000..be63bd44 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f32_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
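The forward instances mirror the backward ones: per channel of an NHWC tensor, compute mean and variance over N, H and W, then normalize, scale and shift. A per-channel reference under that assumption (running-statistics update omitted):

// Per-channel batchnorm forward reference; R = N*H*W values of one channel.
#include <cmath>
#include <cstddef>
#include <vector>

void batchnorm_fwd_channel(const std::vector<float>& x,
                           std::vector<float>& y,
                           float scale, // gamma
                           float bias,  // beta
                           double eps)
{
    const std::size_t R = x.size();
    double mean = 0.0, var = 0.0;
    for(float v : x)
        mean += v;
    mean /= static_cast<double>(R);
    for(float v : x)
        var += (v - mean) * (v - mean);
    var /= static_cast<double>(R);

    const double inv_std = 1.0 / std::sqrt(var + eps);
    y.resize(R);
    for(std::size_t i = 0; i < R; ++i)
        y[i] = static_cast<float>(scale * (x[i] - mean) * inv_std + bias);
}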
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_f32_blockwise_instances = std::tuple< + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format off +template +using device_batchnorm_forward_f32_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + 
DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_f32_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_f32_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_f32_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp new file mode 100644 index 00000000..fe87091e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/batchnorm/device_batchnorm_forward_f64_instance.cpp @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batchnorm_forward_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +// clang-format off +template +using device_batchnorm_forward_f64_blockwise_instances = std::tuple< + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +// clang-format 
off +template +using device_batchnorm_forward_f64_multiblock_instances = + std::tuple < + // XDataType, YDataType, AccDataType, ScaleDataType, BiasDataType, MeanVarDataType, YElementwiseOp, Rank, NumReduceDim, UseMultiBlockInK, BLockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XSrcYDstVectorDim, XSrcVectorSize, YDstVectorSize, ScaleSrcVectorSize, BiasSrcVectorSize, MeanVarSrcDstVectorSize + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl, + DeviceBatchNormFwdImpl + >; +// clang-format on + +void add_device_batchnorm_forward_rank_4_3_f64_instances( + std::vector< + std::unique_ptr>>& + instances) +{ + add_device_operation_instances( + instances, device_batchnorm_forward_f64_blockwise_instances<4, 3, PassThrough>{}); + add_device_operation_instances( + instances, device_batchnorm_forward_f64_multiblock_instances<4, 3, PassThrough>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt new file mode 100644 index 00000000..ffd6a6a7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt @@ -0,0 +1,7 @@ +add_instance_library(device_contraction_bilinear_instance + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp new file mode 100644 index 00000000..ebbff883 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
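The contraction_bilinear instances that follow pair a 2+2+2-dimensional tensor contraction with the Bilinear epilogue, i.e. E = alpha * (A contracted with B over k0, k1) + beta * D. That reading comes from the A/B/D/E layout comment inside the file and from the Bilinear elementwise operation carrying the alpha and beta factors; the naive reference below uses generic row-major host buffers and is illustrative only.

// Naive reference for the M2/N2/K2 bilinear contraction:
// E[m0,m1,n0,n1] = alpha * sum_{k0,k1} A[m0,m1,k0,k1] * B[n0,n1,k0,k1]
//                + beta  * D[m0,m1,n0,n1]
#include <cstddef>
#include <vector>

void contraction_bilinear_ref(const std::vector<float>& A,
                              const std::vector<float>& B,
                              const std::vector<float>& D,
                              std::vector<float>& E,
                              int M0, int M1, int N0, int N1, int K0, int K1,
                              float alpha, float beta)
{
    E.assign(static_cast<std::size_t>(M0) * M1 * N0 * N1, 0.f);
    for(int m0 = 0; m0 < M0; ++m0)
        for(int m1 = 0; m1 < M1; ++m1)
            for(int n0 = 0; n0 < N0; ++n0)
                for(int n1 = 0; n1 < N1; ++n1)
                {
                    float acc = 0.f;
                    for(int k0 = 0; k0 < K0; ++k0)
                        for(int k1 = 0; k1 < K1; ++k1)
                            acc += A[((m0 * M1 + m1) * K0 + k0) * K1 + k1] *
                                   B[((n0 * N1 + n1) * K0 + k0) * K1 + k1];
                    const std::size_t e =
                        ((static_cast<std::size_t>(m0) * M1 + m1) * N0 + n0) * N1 + n1;
                    E[e] = alpha * acc + beta * D[e];
                }
}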
+ +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F32_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// k/k/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, 
GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance{}); +} + +} // namespace instance +} // 
namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp new file mode 100644 index 00000000..980383f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F32_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// k/n/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 
32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp new file mode 100644 index 00000000..2d4b6e34 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F32_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// m/k/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 
1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 
2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp new file mode 100644 index 00000000..7caa469f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F32_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] + D[m0, m1, n0, n1] = E[m0, m1, n0, n1] +// m/n/n/n are the fast changing dimension for A/B/D/E +using device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| 
Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, 
F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, F32_Tuple, F32, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt new file mode 100644 index 00000000..7ad66054 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt @@ -0,0 +1,7 @@ +add_instance_library(device_contraction_scale_instance + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp new file mode 100644 index 00000000..5118d0d0 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] +// k/k/n are the fast changing dimension for A/B/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance( + 
std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp new file mode 100644 index 00000000..655d4f00 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] +// k/n/n are the fast changing dimension for A/B/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + 
//#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, 
Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp new file mode 100644 index 00000000..a9d20be1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! 
+// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] +// m/k/n are the fast changing dimension for A/B/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + 
DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 
1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp new file mode 100644 index 00000000..a68f5c97 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +// This (ifndef) is a hack to use customized behavior for buffer load rather than using default +// setting Don't use this hack unless absolutely necessary! +// FIXME: make the behavior of buffer load a configurable (template) parameter of each device op +#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 1 + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_contraction_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Scale = ck::tensor_operation::element_wise::Scale; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// A[m0, m1, k0, k1] * B[n0, n1, k0, k1] = E[m0, m1, n0, n1] +// m/n/n are the fast changing dimension for A/B/E +using device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance = std::tuple< + // clang-format off + //#####################################| NumDimM| NumDimN| NumDimK| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| 
DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################################| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, 
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceContractionMultipleD_Xdl_CShuffle< 2, 2, 2, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, Scale, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt new file mode 100644 index 00000000..75a36707 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_conv1d_bwd_data_instance + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp new file mode 100644 index 00000000..5a5c8384 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instance.cpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
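All of the instance files in this patch follow the same registration idiom: a std::tuple of fully specialized device-op types is listed between the clang-format markers, and add_device_operation_instances appends one default-constructed object per tuple element to the caller-provided vector of interface pointers. The sketch below shows the general shape of such a helper, assuming it can be expressed with a C++17 fold expression; it is illustrative only, and the real implementation in add_device_operation_instance.hpp may differ in detail.

#include <memory>
#include <tuple>
#include <vector>

// Illustrative stand-in for add_device_operation_instances: push one
// default-constructed concrete op per tuple element through the common base
// pointer. BaseOp / ConcreteOps are placeholders, not CK type names.
template <typename BaseOp, typename... ConcreteOps>
void add_device_operation_instances_sketch(std::vector<std::unique_ptr<BaseOp>>& instances,
                                           const std::tuple<ConcreteOps...>&)
{
    (instances.push_back(std::make_unique<ConcreteOps>()), ...);
}

Calling such a helper with, say, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances{} registers each tuned DeviceConvNdBwdDataNwcKxcNwk_Xdl specialization behind one interface, which is what the add_device_* functions in these files do.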
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NWC = ck::tensor_layout::convolution::NWC; +using KXC = ck::tensor_layout::convolution::KXC; +using NWK = ck::tensor_layout::convolution::NWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 
32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + 
DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances{}); + add_device_operation_instances( + instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp new file mode 100644 index 00000000..e0f3d619 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instance.cpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NWC = ck::tensor_layout::convolution::NWC; +using KXC = ck::tensor_layout::convolution::KXC; +using NWK = ck::tensor_layout::convolution::NWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 
4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| 
NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, 
PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp new file mode 100644 index 00000000..30537d93 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instance.cpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
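For each data type two tuples are registered: one built with ConvBwdDataDefault and one with ConvBwdDataFilter1x1Stride1Pad0, and the add_device_* function appends both to the same vector. The specialized kernels only apply when every filter dimension is 1 with unit stride and zero padding, so a caller probing the instance list needs a check along the lines of the hypothetical helper below; the function name and parameter types are illustrative and not part of the CK API.

#include <cstddef>
#include <vector>

// Hypothetical predicate matching the Filter1x1Stride1Pad0 specialization:
// every spatial filter length is 1, every stride is 1, and all padding is 0.
bool is_filter1x1_stride1_pad0(const std::vector<long>& filter_lengths,
                               const std::vector<long>& strides,
                               const std::vector<long>& left_pads,
                               const std::vector<long>& right_pads)
{
    for(std::size_t i = 0; i < filter_lengths.size(); ++i)
    {
        if(filter_lengths[i] != 1 || strides[i] != 1 || left_pads[i] != 0 ||
           right_pads[i] != 0)
        {
            return false;
        }
    }
    return true;
}

A check like this would typically only complement each device op's own support query; the op remains the final authority on whether it can run a given problem.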
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using NWC = ck::tensor_layout::convolution::NWC; +using KXC = ck::tensor_layout::convolution::KXC; +using NWK = ck::tensor_layout::convolution::NWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, 
PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +void 
add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances{}); + add_device_operation_instances( + instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp new file mode 100644 index 00000000..190c39b8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv1d_bwd_data/device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instance.cpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +using NWC = ck::tensor_layout::convolution::NWC; +using KXC = ck::tensor_layout::convolution::KXC; +using NWK = ck::tensor_layout::convolution::NWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + 
DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +using device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| 
OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, 
ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 1, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances{}); + add_device_operation_instances( + instances, device_conv1d_bwd_data_xdl_nwc_kxc_nwk_1x1_s1_p0_int8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt new file mode 100644 index 00000000..281453b5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt @@ -0,0 +1,10 @@ +add_instance_library(device_conv2d_bwd_data_instance + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp + + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp new file mode 100644 index 00000000..e14cd558 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
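The int8 conv1d instances above pair int8_t input/weight/output types with an int32_t accumulation type. The reason is the usual one for quantized convolution: products of 8-bit values are summed over the whole reduction dimension, which overflows an 8-bit (and quickly even a 16-bit) accumulator. The toy dot product below, with hypothetical names, only illustrates the point and is not CK code.

#include <cstdint>

// Toy illustration of why AccData is int32_t for the int8 instances: the
// products are accumulated in a wider type and only narrowed, if at all, at
// the very end.
int32_t dot_int8(const int8_t* a, const int8_t* b, int n)
{
    int32_t acc = 0;
    for(int i = 0; i < n; ++i)
    {
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
    }
    return acc;
}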
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using OutDataType = ck::half_t; +using AccDataType = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances = std::tuple< + // clang-format off + //#########################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| Convolution| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + //#########################| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + //#########################| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Dl< 2, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDataDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 1, 8, 2>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //#########################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| 
Convolution| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + //#########################| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + //#########################| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Dl< 2, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<1, 1, 8, 2>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp new file mode 100644 index 00000000..f001b83c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
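Every instance in this patch uses PassThrough for the input, weight and output element-wise operations, i.e. no fused epilogue. Conceptually PassThrough is an identity functor applied element by element; the sketch below captures that idea and is not CK's exact definition (the real one in element_wise_operation.hpp also deals with device-side qualifiers and type conversion).

// Simplified sketch of a pass-through element-wise operation: copy the input
// to the output, allowing only for a type conversion. Not the CK definition.
struct PassThroughSketch
{
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const
    {
        y = static_cast<Y>(x);
    }
};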
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using InDataType = float; +using WeiDataType = float; +using OutDataType = float; +using AccDataType = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances = std::tuple< + // clang-format off + //#########################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| Convolution| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + //#########################| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + //#########################| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Dl< 2, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDataDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<1, 1, 8, 1>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //#########################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| Convolution| 
Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + //#########################| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + //#########################| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Dl< 2, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<1, 1, 8, 1>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instances{}); + add_device_operation_instances( + instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp new file mode 100644 index 00000000..83ba6a1c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
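Each of these instance translation units follows the same shape: a std::tuple listing the concrete tuning configurations, plus an add_device_conv2d_bwd_data_* function that appends a default-constructed copy of every tuple element to the caller's vector of type-erased device-op pointers through add_device_operation_instances. Below is a minimal, self-contained sketch of that pattern; the type names are hypothetical stand-ins, not the ck::tensor_operation::device classes used in the files above.

// Sketch of the tuple-to-vector registration pattern (stand-in types only; the
// real code registers DeviceConvNdBwdDataNwcKxcNwk_Dl specializations through
// ck's add_device_operation_instances helper).
#include <memory>
#include <tuple>
#include <vector>

struct ConvBwdDataBase // stand-in for the type-erased device-op base class
{
    virtual ~ConvBwdDataBase() = default;
};

struct DefaultTileConfig : ConvBwdDataBase {};   // e.g. the Default instance
struct Filter1x1TileConfig : ConvBwdDataBase {}; // e.g. the 1x1/s1/p0 instance

using InstanceTuple = std::tuple<DefaultTileConfig, Filter1x1TileConfig>;

// Default-construct every tuple element and hand ownership to the caller's list.
template <typename... Ts>
void add_instances(std::vector<std::unique_ptr<ConvBwdDataBase>>& out, std::tuple<Ts...>)
{
    (out.push_back(std::make_unique<Ts>()), ...); // C++17 fold over the comma operator
}

int main()
{
    std::vector<std::unique_ptr<ConvBwdDataBase>> instances;
    add_instances(instances, InstanceTuple{});
    return instances.size() == 2 ? 0 : 1;
}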
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using InDataType = int8_t; +using WeiDataType = int8_t; +using OutDataType = int8_t; +using AccDataType = int32_t; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances = std::tuple< + // clang-format off + //#########################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| Convolution| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + //#########################| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + //#########################| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Dl< 2, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDataDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<1, 1, 8, 4>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances = std::tuple< + // clang-format off + //#########################| NDim| InData| WeiData| OutData| AccData| In| Wei| Out| 
Convolution| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + //#########################| Spatial| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Forward| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + //#########################| | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Dl< 2, InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<1, 1, 8, 4>, S<16, 1, 16, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 8, 1>, S<0, 3, 1, 2>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instances{}); + add_device_operation_instances( + instances, device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp new file mode 100644 index 00000000..1da9a81d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
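Note that the Default and Filter1x1Stride1Pad0 tuples are appended to the same flat list, so the caller is expected to filter at run time: a 1x1/stride-1/pad-0 instance only applies when the problem actually has that filter shape. The sketch below shows such a selection loop with a hypothetical interface; the real CK device ops expose argument/invoker factory methods whose exact signatures are not reproduced here, so IsSupported() and Run() are illustrative stand-ins only.

// Hedged sketch: pick the fastest registered instance that supports the problem.
#include <limits>
#include <memory>
#include <vector>

struct Problem
{
    int filter_h = 3, filter_w = 3, stride = 1, pad = 1;
};

struct ConvBwdDataBase
{
    virtual ~ConvBwdDataBase() = default;
    virtual bool IsSupported(const Problem&) const = 0; // stand-in for a support check
    virtual float Run(const Problem&) const = 0;        // stand-in; returns elapsed ms
};

// General-purpose instance: accepts any problem.
struct DefaultInstance : ConvBwdDataBase
{
    bool IsSupported(const Problem&) const override { return true; }
    float Run(const Problem&) const override { return 1.0f; }
};

// 1x1 / stride-1 / pad-0 specialization: only valid for that filter shape.
struct Filter1x1Stride1Pad0Instance : ConvBwdDataBase
{
    bool IsSupported(const Problem& p) const override
    {
        return p.filter_h == 1 && p.filter_w == 1 && p.stride == 1 && p.pad == 0;
    }
    float Run(const Problem&) const override { return 0.5f; }
};

// What a tuning pass does: keep the fastest instance that passes the support check.
const ConvBwdDataBase* pick_best(const std::vector<std::unique_ptr<ConvBwdDataBase>>& instances,
                                 const Problem& prob)
{
    const ConvBwdDataBase* best = nullptr;
    float best_ms               = std::numeric_limits<float>::max();
    for(const auto& inst : instances)
    {
        if(!inst->IsSupported(prob))
            continue;
        const float ms = inst->Run(prob);
        if(ms < best_ms)
        {
            best_ms = ms;
            best    = inst.get();
        }
    }
    return best;
}

int main()
{
    std::vector<std::unique_ptr<ConvBwdDataBase>> instances;
    instances.push_back(std::make_unique<DefaultInstance>());
    instances.push_back(std::make_unique<Filter1x1Stride1Pad0Instance>());

    Problem conv3x3; // 3x3 filter: only the Default instance qualifies
    return pick_best(instances, conv3x3) != nullptr ? 0 : 1;
}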
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +// FIXME: retire dedicated 2D version +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 
7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, BF16, BF16, BF16, F32, 
PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, 
true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +using device_conv_dedidecate_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances = + std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + 
DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances{}); + add_device_operation_instances( + instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); + add_device_operation_instances( + instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances{}); + add_device_operation_instances( + instances, + device_conv_dedidecate_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp new file mode 100644 index 00000000..7c33df5e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
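For readability, here is the first bf16 XDL instance above with its positional template arguments bound to the names from the column-header comments. This is a decoding aid only; the constexpr names are illustrative and are not part of the CK API.

// First DeviceConvNdBwdDataNwcKxcNwk_Xdl bf16 instance, decoded per the header row:
constexpr int NumDimSpatial = 2; // 2-D convolution; BF16 in/wei/out, F32 accumulator,
                                 // PassThrough element-wise ops, ConvBwdDataDefault
constexpr int BlockSize     = 256;
constexpr int MPerBlock     = 256;
constexpr int NPerBlock     = 128;
constexpr int K0PerBlock    = 4;
constexpr int K1            = 8;
constexpr int MPerXDL       = 32;
constexpr int NPerXDL       = 32;
constexpr int MXdlPerWave   = 4;
constexpr int NXdlPerWave   = 2;
// ...followed by the A/B block-transfer descriptors (thread-cluster lengths such as
// S<4, 64, 1>, access orders, scalar-per-vector values, LDS padding flags) and the
// C thread-transfer settings (SrcDstVectorDim, DstScalarPerVector).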
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +// FIXME: retire dedicated 2D version +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + 
DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + 
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 
64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, +#if !CK_WORKAROUND_SWDEV_325164 + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, +#endif + 
DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = + std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + 
DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, 
ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); + add_device_operation_instances( + instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp new file mode 100644 index 00000000..a5f8629f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +// FIXME: retire dedicated 2D version +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| 
Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 
1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, 
PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 4, 
32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 
64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances = + std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< 
F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>
+    // clang-format on
+    >;
+
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
+    std::vector>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{});
+    add_device_operation_instances(
+        instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{});
+    add_device_operation_instances(
+        instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{});
+    add_device_operation_instances(
+        instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
new file mode 100644
index 00000000..8076d6d3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
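Note: every add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_*_instances function in these files follows the same registration idiom: each tuned tile configuration listed in the std::tuple aliases is default-constructed and appended to the caller-supplied instances vector through add_device_operation_instances. The stand-alone C++17 sketch below only illustrates that idiom under assumed names; OpBase, TileA, TileB and the simplified helper are illustrative stand-ins, not the library's actual types.

#include <memory>
#include <tuple>
#include <vector>

// Illustrative stand-ins for a device-op interface and two tuned instances.
struct OpBase { virtual ~OpBase() = default; };
struct TileA : OpBase {};
struct TileB : OpBase {};

// Simplified registration helper: default-construct every type in the tuple
// and hand it to the caller's vector as a base-class pointer.
template <typename Base, typename... Instances>
void add_device_operation_instances(std::vector<std::unique_ptr<Base>>& out,
                                    const std::tuple<Instances...>&)
{
    (out.push_back(std::make_unique<Instances>()), ...);
}

int main()
{
    std::vector<std::unique_ptr<OpBase>> instances;
    add_device_operation_instances(instances, std::tuple<TileA, TileB>{});
    // 'instances' now holds one object per tuned configuration; a caller can
    // filter out unsupported ones and time the rest to pick the fastest.
    return instances.size() == 2 ? 0 : 1;
}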
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +// FIXME: retire dedicated 2D version +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 
16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | 
| | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, 
int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 2, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances = std::tuple< + // clang-format off + //####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +// FIXME: retire dedicated 2D version +using device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances = + std::tuple< + // clang-format off + //#####################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //#####################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| 
SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //#####################################################################| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //#####################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< 
int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>,
+    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>
+    // clang-format on
+    >;
+
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
+    std::vector>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{});
+    add_device_operation_instances(
+        instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{});
+    add_device_operation_instances(
+        instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{});
+    add_device_operation_instances(
+        instances, device_conv_dedicated_2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
new file mode 100644
index 00000000..5b646852
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_instance_library(device_conv2d_fwd_instance
+    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+    device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+    device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
+)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
new file mode 100644
index 00000000..33503b9f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
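Note: the instance lists above and below are grouped by convolution specialization. Default instances handle arbitrary filter size, stride, and padding, while the Filter1x1Stride1Pad0 variants (and, for forward convolution, Filter1x1Pad0) rely on the fact that a 1x1 window with unit stride and zero padding leaves the spatial extent unchanged, so the window-indexing logic can be dropped. The short sketch below only restates the standard output-size arithmetic behind that split; conv_out_len is an illustrative helper, not a library function.

#include <cstdio>

// Output length of a convolution along one spatial dimension.
// For filter = 1, stride = 1, pad = 0 this reduces to out == in, which is
// the case the Filter1x1Stride1Pad0 instances are specialized for.
int conv_out_len(int in, int filter, int stride, int pad, int dilation)
{
    return (in + 2 * pad - dilation * (filter - 1) - 1) / stride + 1;
}

int main()
{
    std::printf("3x3, stride 2, pad 1, in 28 -> %d\n", conv_out_len(28, 3, 2, 1, 1)); // prints 14
    std::printf("1x1, stride 1, pad 0, in 28 -> %d\n", conv_out_len(28, 1, 1, 0, 1)); // prints 28
    return 0;
}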
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto ConvFwdOddC = + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + +// arbitrary conv +using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances = std::tuple< + // clang-format off + //##########################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, 
PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// 1x1, pad 0 +using 
device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple< + // clang-format off + //##########################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// 1x1, stride 1, pad 0 +using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //##########################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 
1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple< + // clang-format off + //##########################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 128, 64, 2, 4, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 256, 256, 64, 2, 4, 32, 32, 4, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 128, 64, 2, 4, 32, 32, 2, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdOddC, 128, 64, 64, 2, 4, 32, 32, 1, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp new file mode 100644 index 00000000..c5e4bd19 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
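+
+// The tuple definitions below enumerate tuning configurations for the same bf16 NHWC/KYXC/NHWK
+// forward convolution (block size, M/N/K0 tile sizes, XDL wave shape, block-transfer cluster
+// layouts, vector widths); the add_*_instances() function at the end of the file registers one
+// device operation per row. A minimal sketch of how a client typically consumes such a list is
+// given here for orientation only; the interface pointer typedef and the profiling loop are
+// assumed for illustration and are not part of this patch:
+//
+//   std::vector<DeviceConvFwdNoOpPtr> instances; // element type assumed; see the library headers
+//   add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(instances);
+//   for(auto& op : instances)
+//   {
+//       auto arg     = op->MakeArgumentPointer(/* tensor pointers, lengths, strides, pads */);
+//       auto invoker = op->MakeInvokerPointer();
+//       if(op->IsSupportedArgument(arg.get()))
+//       {
+//           invoker->Run(arg.get()); // time this call and keep the fastest supported instance
+//       }
+//   }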
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, 
PassThrough, ConvFwdDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| 
XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + 
DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, 
PassThrough, ConvFwd1x1S1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_bf16_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp new file mode 100644 index 00000000..f43d13e3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced 
Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, 
PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| 
Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + 
DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 
128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f16_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp new file mode 100644 index 00000000..0ce6b04c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
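+
+// Note on the parameter choices below: the f32 rows use K1 = 4 and Src/DstScalarPerVector = 4,
+// where the f16/bf16 counterparts use 8. Both appear to describe the same 16-byte (128-bit)
+// vector memory access, since 8 x sizeof(half_t) = 8 x 2 B = 16 B and 4 x sizeof(float) =
+// 4 x 4 B = 16 B; this reading of the table is an inference, not something stated in the patch.
+// The ConvFwd1x1P0 / ConvFwd1x1S1P0 specializations follow the usual output-size relation
+//   Ho = (Hi + 2*PadH - DilationH*(Y - 1) - 1) / StrideH + 1   (and likewise for Wo),
+// which for a 1x1 filter with stride 1 and pad 0 reduces to Ho = Hi, Wo = Wi, so those
+// instances can presumably skip the general padded-window index math.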
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 
128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f32_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| 
SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, 
PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f32_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp new file mode 100644 index 00000000..76ab3189 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +using NHWC = ck::tensor_layout::convolution::NHWC; +using KYXC = ck::tensor_layout::convolution::KYXC; +using NHWK = ck::tensor_layout::convolution::NHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, 
PassThrough, PassThrough, ConvFwdDefault, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_int8_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + 
//################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1> + // clang-format on + >; + +using device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances = std::tuple< + // clang-format off + //################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 
7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1>, + DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, 7, 1> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_int8_instances{}); + add_device_operation_instances(instances, + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{}); +} + +} // namespace 
instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
new file mode 100644
index 00000000..670cd94f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_instance_library(device_conv2d_fwd_bias_relu_instance
+   device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+)
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
new file mode 100644
index 00000000..f8c25508
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddRelu = ck::tensor_operation::element_wise::AddRelu;
+
+static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+// arbitrary conv
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = std::tuple<
+    // clang-format off
+    //##########################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+    //##########################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
+
//##########################################################################################| | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// 1x1, pad 0 +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple< + // clang-format off + //##########################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################################| | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 
8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// 1x1, stride 1, pad 0 +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //##########################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################################| | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, 
F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwd1x1S1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// Odd C +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple< + // clang-format off + 
//##########################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##########################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| GlobalMemory| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##########################################################################################| | | | | Operation| Operation| Operation| DataOperation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##########################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 
2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 128, 64, 2, 4, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 256, 256, 64, 2, 4, 32, 32, 4, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 128, 64, 2, 4, 32, 32, 2, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddRelu, MemorySet, ConvFwdOddC, 128, 64, 64, 2, 4, 32, 32, 1, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances{}); + add_device_operation_instances( + instances, + device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt new file mode 100644 index 00000000..68d5f582 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/CMakeLists.txt @@ -0,0 +1,4 @@ +add_instance_library(device_conv2d_fwd_bias_relu_add_instance + device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp new file mode 100644 index 00000000..fe715247 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_c_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddReluAdd = ck::tensor_operation::element_wise::AddReluAdd; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto ConvFwdOddC = + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + +// arbitrary conv +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances = std::tuple< + // clang-format off + //##############################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##############################################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##############################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, 
F32, PassThrough, PassThrough, AddReluAdd, ConvFwdDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// 1x1, pad 0 +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple< + // clang-format off + //##############################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##############################################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##############################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// 1x1, stride 1, pad 0 +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //##############################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + 
//##############################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##############################################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##############################################################################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, 
PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwd1x1S1P0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +// Odd C +using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple< + // clang-format off + //##############################################################################################| InData| WeiData| OutData| AccData| In| Wei| Out| ConvForward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################################################################################| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //##############################################################################################| | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //##############################################################################################| | 
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + 
DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 128, 64, 2, 4, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 256, 256, 64, 2, 4, 32, 32, 4, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 8>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 128, 64, 2, 4, 32, 32, 2, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 32, 1, 1, 4>, 8>, + DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K< F16, F16, F16, F32, PassThrough, PassThrough, AddReluAdd, ConvFwdOddC, 128, 64, 64, 2, 4, 32, 32, 1, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, true, 1, 1, S<1, 1, 16, 1, 1, 4>, 8> + // clang-format on + >; + +void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances{}); + add_device_operation_instances( + instances, + device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances{}); + add_device_operation_instances( + instances, + device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{}); + add_device_operation_instances( + instances, + device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt new file mode 100644 index 00000000..db92208f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_conv3d_bwd_data_instance + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp new file mode 100644 index 00000000..04ce7c07 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instance.cpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NDHWC = ck::tensor_layout::convolution::NDHWC; +using KZYXC = ck::tensor_layout::convolution::KZYXC; +using NDHWK = ck::tensor_layout::convolution::NDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +using 
device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, 
PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, BF16, BF16, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances{}); + add_device_operation_instances( + instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp new file mode 100644 index 00000000..0251d915 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instance.cpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
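
Each conv3d backward-data instance file carries the comment "Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k]", i.e. the instances compute the gradient with respect to the input of that forward convolution. As a hedged reference (plain C++ helper written for this description, not a CK kernel), the computation reduced to 1D NWC/KXC/NWK looks like this:

// Reference sketch of backward-data convolution in 1D, channels-last layout.
// din[n][wi][c] += dout[n][wo][k] * wei[k][x][c] whenever
// wi == wo * stride + x * dilation - pad  (the forward index relation, inverted).
#include <vector>

void conv1d_bwd_data_ref(std::vector<float>& din,        // [N * Wi * C], zero-initialized
                         const std::vector<float>& wei,  // [K * X * C]
                         const std::vector<float>& dout, // [N * Wo * K]
                         int N, int Wi, int C, int K, int X, int Wo,
                         int stride, int dilation, int pad)
{
    for(int n = 0; n < N; ++n)
        for(int wo = 0; wo < Wo; ++wo)
            for(int k = 0; k < K; ++k)
                for(int x = 0; x < X; ++x)
                {
                    const int wi = wo * stride + x * dilation - pad;
                    if(wi < 0 || wi >= Wi)
                        continue;
                    for(int c = 0; c < C; ++c)
                        din[(n * Wi + wi) * C + c] +=
                            dout[(n * Wo + wo) * K + k] * wei[(k * X + x) * C + c];
                }
}
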
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using NDHWC = ck::tensor_layout::convolution::NDHWC; +using KZYXC = ck::tensor_layout::convolution::KZYXC; +using NDHWK = ck::tensor_layout::convolution::NDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 
256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + 
DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 
2, 1>, 1, 2, 8, true, 7, 1> + // clang-format on + >; + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances{}); + add_device_operation_instances( + instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp new file mode 100644 index 00000000..c2975727 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instance.cpp @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using NDHWC = ck::tensor_layout::convolution::NDHWC; +using KZYXC = ck::tensor_layout::convolution::KZYXC; +using NDHWK = ck::tensor_layout::convolution::NDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 
4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| 
MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, 
F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1> + // clang-format on + >; + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances{}); + add_device_operation_instances( + instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp new file mode 100644 index 00000000..fc86d730 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/conv3d_bwd_data/device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instance.cpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
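Editor's note on the registration pattern used by every `*_instance.cpp` file in this patch: each file builds a `std::tuple` of fully specialized device operations (here `DeviceConvNdBwdDataNwcKxcNwk_Xdl<...>`) and passes it to `add_device_operation_instances`, which appends one heap-allocated copy of every tuple element to the caller's `instances` vector. The helper below is a simplified sketch of that pattern under stated assumptions, not the library's verbatim implementation; the name `add_device_operation_instances_sketch` and the `std::apply` fold are illustrative.

#include <memory>
#include <tuple>
#include <type_traits>
#include <vector>

// Sketch: copy every element of an instance tuple into the polymorphic instance
// vector owned by the caller (assumed behaviour of add_device_operation_instances).
template <typename BaseOp, typename NewOpInstances>
void add_device_operation_instances_sketch(std::vector<std::unique_ptr<BaseOp>>& instances,
                                           const NewOpInstances& new_op_instances)
{
    std::apply(
        [&](const auto&... op) {
            // each `op` is one concrete DeviceConvNdBwdDataNwcKxcNwk_Xdl<...> (or, later in
            // this patch, DeviceGemmDl<...>) specialization taken from the tuples above
            (instances.push_back(std::make_unique<std::decay_t<decltype(op)>>(op)), ...);
        },
        new_op_instances);
}

This is why the `add_*_instances` functions simply call the helper twice, once for the default-specialization tuple and once for the 1x1/stride-1/pad-0 tuple.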
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_xdl.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +using NDHWC = ck::tensor_layout::convolution::NDHWC; +using KZYXC = ck::tensor_layout::convolution::KZYXC; +using NDHWK = ck::tensor_layout::convolution::NDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, 
ConvBwdDataDefault, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +using device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances = std::tuple< + // clang-format off + //##############################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##############################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Data| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##############################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | 
PerVector| PerVector_K1| | | PerVector| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 256, 128, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 256, 4, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 128, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 128, 4, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 64, 128, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 128, 64, 4, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 256, 64, 128, 4, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 64, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 128, 32, 4, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 1, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 128, 32, 128, 4, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 32, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 64, 32, 4, 16, 32, 32, 2, 1, S<4, 16, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 2, 16, true, 7, 1>, + DeviceConvNdBwdDataNwcKxcNwk_Xdl< 3, int8_t, int8_t, int8_t, int32_t, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, 64, 32, 64, 4, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, true, S<4, 16, 1>, S<2, 0, 1>, S<0, 2, 1>, 1, 4, 16, true, 7, 1> + // clang-format on + >; + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances{}); + add_device_operation_instances( + instances, device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_1x1_s1_p0_int8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt new file mode 100644 index 00000000..47516b41 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/elementwise/CMakeLists.txt @@ -0,0 +1,3 @@ +add_instance_library(device_elementwise_instance + device_normalize_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp new file mode 100644 index 00000000..baddecf6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/elementwise/device_normalize_instance.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using inputType = F16; +using MeanType = F32; +using SquareMeanType = F32; +using GammaDataType = F16; +using BetaDataType = F16; +using outputType = F16; + +using Normalize = ck::tensor_operation::element_wise::Normalize; +using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std::tuple< + // clang-format off + //###################|| | functor| NDim| MPerThread| | | + DeviceElementwise, Tuple, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >, + DeviceElementwise, Tuple, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >, + DeviceElementwise, Tuple, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >, + DeviceElementwise, Tuple, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> > + // clang-format on + >; + +void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances( + std::vector, Tuple, Normalize, 2>>& + instances) +{ + add_device_operation_instances( + instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt new file mode 100644 index 00000000..0c7cc2cd --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt @@ -0,0 +1,3 @@ +add_instance_library(device_elementwise_normalization_instance + device_elementwise_normalization_f16_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp new file mode 100644 index 00000000..7f15372e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/device_elementwise_normalization_f16_instance.cpp @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Add = ck::tensor_operation::element_wise::Add; +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +// clang-format off +using device_elementwise_normalization_f16_instances = + std::tuple < + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1, 1, 1, 1, 1>, // fallback kernel + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 2, 1, 2, 1, 2, 2>, // fallback kernel + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 1, 4, 1, 4, 4>, // fallback kernel + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 32, 1, 8, 1, 8, 1, 8, 8>, + DeviceElementwiseNormalizationImpl, F16, F16, F32, F16, XElementwise ,YElementwise, Rank, Reduce, 1024, 1, 1024, 1, 8, 1, 
2, 1, 2, 1, 2, 2> + >; +// clang-format on + +void add_device_elementwise_normalization_rank_2_1_f16_instances( + std::vector, F16, F16, F32, F16, Add, Pass, 2, 1>>>& + instances) +{ + add_device_operation_instances( + instances, device_elementwise_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt new file mode 100644 index 00000000..e20d592c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -0,0 +1,43 @@ +add_instance_library(device_gemm_instance + device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp + device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp + device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp + device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp + device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp + device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp + device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..5d2f18e1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
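Editor's note: the elementwise-normalization file above deliberately registers a dozen tilings of the same kernel, from the scalar "fallback kernel" rows up to 8-wide vectorized variants. Callers collect all of them and, at run time, discard the configurations whose vector widths or thread-cluster shapes do not fit the problem, then time the remainder and keep the fastest. Below is a hedged sketch of that selection loop, assuming the usual CK base-operator interface (IsSupportedArgument, MakeInvokerPointer, Run, GetTypeString) and the `ck/stream_config.hpp` header; argument construction is operation-specific, so it is delegated to a callback rather than invented here.

#include <iostream>

#include "ck/stream_config.hpp" // header name assumed; provides StreamConfig

// Sketch of how a caller typically consumes the vectors filled by the
// add_*_instances functions in this patch.
template <typename DeviceOpPtrs, typename MakeArgument>
void pick_fastest_instance(DeviceOpPtrs& instances, MakeArgument&& make_argument)
{
    for(auto& op_ptr : instances)
    {
        auto argument_ptr = make_argument(*op_ptr);          // stands in for the op-specific MakeArgumentPointer(...)
        if(!op_ptr->IsSupportedArgument(argument_ptr.get())) // skip tilings that do not fit this problem
            continue;
        auto invoker_ptr = op_ptr->MakeInvokerPointer();
        float ave_time   = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
        std::cout << op_ptr->GetTypeString() << " : " << ave_time << " ms" << std::endl;
    }
}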
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_dl_f16_f16_f16_km_kn_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp new file mode 100644 index 00000000..01e3b379 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_dl_f16_f16_f16_km_nk_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..804e86a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_dl_f16_f16_f16_mk_kn_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..159fa90f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_dl_f16_f16_f16_mk_nk_mn_instances = + std::tuple< + // clang-format off + // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f16_f16_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp new file mode 100644 index 00000000..d8e77984 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
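Editor's note on naming: the four f16 DL GEMM files above differ only in their layout suffix. Reading them against their "Compilation parameters for a[...] * b[...] = c[...]" comments, `mk`/`km` selects row- or column-major A, `kn`/`nk` selects row- or column-major B, and `mn` is always a row-major C. The snippet below restates that mapping using the Row/Col aliases each file already defines; the `LayoutsOf_*` aliases are illustrative only and do not exist in the library.

#include <tuple>

#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"

using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;

// File-name suffix -> (ALayout, BLayout, CLayout), matching the DeviceGemmDl instantiations above.
using LayoutsOf_mk_kn_mn = std::tuple<Row, Row, Row>; // A[m,k], B[k,n], C[m,n]
using LayoutsOf_mk_nk_mn = std::tuple<Row, Col, Row>; // B stored transposed as [n,k]
using LayoutsOf_km_kn_mn = std::tuple<Col, Row, Row>; // A stored transposed as [k,m]
using LayoutsOf_km_nk_mn = std::tuple<Col, Col, Row>; // both A and B transposed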
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_dl_f32_f32_f32_km_kn_mn_instances = std::tuple< + // clang-format off + // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp new file mode 100644 index 00000000..0034ac59 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_dl_f32_f32_f32_km_nk_mn_instances = + std::tuple< + // clang-format off + // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..0b540b8b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_dl_f32_f32_f32_mk_kn_mn_instances = + std::tuple< + // clang-format off + // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<2, 1, 4, 1>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..4f6ff511 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_dl_f32_f32_f32_mk_nk_mn_instances = + std::tuple< + // clang-format off + // ########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_f32_f32_f32_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp new file mode 100644 index 00000000..a4208245 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_dl_i8_i8_i8_km_kn_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< int8_t, int8_t, int8_t, int32_t, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp new file mode 100644 index 00000000..06fab7f6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_dl_i8_i8_i8_km_nk_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< int8_t, int8_t, int8_t, int32_t, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..b6d72fa2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
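The layout infixes in these file names map one-to-one to the comment at the top of each tuple: "mk"/"km" select row-/column-major A (a[m, k] vs a[k, m]), "kn"/"nk" select row-/column-major B (b[k, n] vs b[n, k]), and the trailing "mn" is always a row-major C. For densely packed tensors this fixes the leading dimensions a caller has to pass; a small illustrative helper (hypothetical, not part of the patch):

    #include <cstdint>

    struct GemmLeadingDims
    {
        std::int64_t lda, ldb, ldc;
    };

    // a_row_major == true  -> "mk" layout (a[m, k], lda = K); false -> "km" (a[k, m], lda = M)
    // b_row_major == true  -> "kn" layout (b[k, n], ldb = N); false -> "nk" (b[n, k], ldb = K)
    // C is always "mn" here (c[m, n], ldc = N); all tensors assumed densely packed.
    constexpr GemmLeadingDims make_leading_dims(
        std::int64_t M, std::int64_t N, std::int64_t K, bool a_row_major, bool b_row_major)
    {
        return {a_row_major ? K : M, b_row_major ? N : K, N};
    }

    static_assert(make_leading_dims(128, 256, 64, /*mk*/ true, /*nk*/ false).lda == 64,
                  "row-major a[m, k] uses lda = K");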
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_dl_i8_i8_i8_mk_kn_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< int8_t, int8_t, int8_t, int32_t, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<2, 1, 4, 4>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..67d2e3ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_dl_i8_i8_i8_mk_nk_mn_instances = std::tuple< + // clang-format off + // #########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // #########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Specialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // #########| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // #########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmDl< int8_t, int8_t, int8_t, int32_t, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_dl_i8_i8_i8_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..03eebf4e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
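With all four int8 DL layout variants registered above, a typical client loops over the returned instances, skips those whose IsSupportedArgument check rejects the problem, and keeps the fastest. The sketch below follows the MakeArgumentPointer / MakeInvokerPointer / Run(..., StreamConfig) pattern of the client_example and example programs added earlier in this patch; the exact base-class signatures are an assumption here and should be checked against device_gemm.hpp.

    #include <cstdint>
    #include <limits>
    #include <memory>
    #include <string>
    #include <vector>

    // Reuses the Row/Col/PassThrough aliases from the instance files above.
    std::string pick_fastest_i8_mk_nk_instance(const void* p_a, const void* p_b, void* p_c,
                                               ck::index_t M, ck::index_t N, ck::index_t K,
                                               ck::index_t StrideA, ck::index_t StrideB,
                                               ck::index_t StrideC)
    {
        using DeviceOpI8 = ck::tensor_operation::device::DeviceGemm<
            Row, Col, Row, int8_t, int8_t, int8_t, PassThrough, PassThrough, PassThrough>;

        std::vector<std::unique_ptr<DeviceOpI8>> ops;
        ck::tensor_operation::device::instance::add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(ops);

        float best_ms = std::numeric_limits<float>::max();
        std::string best_name;

        for(auto& op : ops)
        {
            auto arg = op->MakeArgumentPointer(p_a, p_b, p_c, M, N, K, StrideA, StrideB, StrideC,
                                               PassThrough{}, PassThrough{}, PassThrough{});
            auto invoker = op->MakeInvokerPointer();

            if(!op->IsSupportedArgument(arg.get()))
                continue; // this tuning configuration cannot serve the given shape/alignment

            const float ms = invoker->Run(arg.get(), StreamConfig{nullptr, true}); // time the kernel
            if(ms < best_ms)
            {
                best_ms   = ms;
                best_name = op->GetTypeString();
            }
        }
        return best_name; // empty if no instance supported the problem
    }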
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | Version| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 
1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 
256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, 
Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 2, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..5d8de04c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
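Each row in these tuples has to be internally consistent: K0PerBlock is KPerBlock divided by the K1 vector width, and the ABlockTransfer thread cluster over (K0, M, K1) tiles the block's A slice with at most BlockSize threads. Taking one of the 256-thread rows above (BlockSize 256, MPerBlock 256, KPerBlock 32, AK1 8, cluster S<4, 64, 1>, SrcScalarPerVector 8) as a worked example:

    // Illustrative arithmetic only; the values are copied from one tuning row above.
    constexpr int BlockSize      = 256;
    constexpr int MPerBlock      = 256;
    constexpr int KPerBlock      = 32;
    constexpr int AK1            = 8;
    constexpr int K0             = KPerBlock / AK1; // 4
    constexpr int ClusterThreads = 4 * 64 * 1;      // S<4, 64, 1> over (K0, M, K1)
    constexpr int ElemsPerThread = (K0 * MPerBlock * AK1) / ClusterThreads; // 32 values

    static_assert(K0 * AK1 == KPerBlock, "AK1 must divide KPerBlock evenly");
    static_assert(ClusterThreads == BlockSize, "this row's A-side cluster covers the whole block");
    static_assert(ElemsPerThread % 8 == 0, "each thread issues whole 8-wide vector loads");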
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, 
S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances{}); +} + +} // 
namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp new file mode 100644 index 00000000..7b12b7cf --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, 
GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 
1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..730ffd46 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, 
F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 
32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..619473ff --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
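As the column headers indicate, these BF16 rows store A/B/C as ck::bhalf_t, accumulate in F32 (the AccData type), and convert back to BF16 for the CShuffle output stage. A host-side sketch of the same round trip, assuming ck::type_convert provides the bhalf_t <-> float conversions used by the reference code elsewhere in this patch:

    #include "ck/utility/data_type.hpp" // assumed header for ck::bhalf_t and ck::type_convert

    // Accumulate two BF16 values the way the kernels above do: widen to F32, add, narrow back.
    inline ck::bhalf_t add_bf16_with_f32_accumulation(ck::bhalf_t a, ck::bhalf_t b)
    {
        const float acc = ck::type_convert<float>(a) + ck::type_convert<float>(b); // F32 accumulate
        return ck::type_convert<ck::bhalf_t>(acc);                                 // BF16 result
    }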
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, BF16, BF16, BF16, F32, BF16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..8e06f9d2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 
2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 
128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, 
LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 
1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, 
F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>
+#endif
+    // clang-format on
+    >;
+
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
new file mode 100644
index 00000000..f9458b74
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// Compilation parameters for a[k, m] * b[n, k] = c[m, n]
+using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
+    // clang-format off
+    //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline|
+    //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | |
+    //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1|
ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 
4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 
128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Col, Col, 
Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>,
+    DeviceGemm_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>
+#endif
+    // clang-format on
+    >;
+
+void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(instances,
+                                   device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
new file mode 100644
index 00000000..77a03b74
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
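For context: the add_device_gemm_xdl_c_shuffle_f16_f16_f16_*_instances functions added by these files are the hooks the CK instance machinery uses to enumerate candidate tuned kernels. The sketch below is illustrative only and not part of this patch; it shows one way a caller might consume the km_kn_mn registrations, assuming the usual DeviceGemm interface (MakeArgumentPointer, IsSupportedArgument, MakeInvokerPointer, StreamConfig) and that the registration function is declared in ck/library/tensor_operation_instance/gpu/gemm.hpp. The buffer pointers, sizes, and strides are caller-supplied placeholders.

#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp"

namespace {

using F16         = ck::half_t;
using Row         = ck::tensor_layout::gemm::RowMajor;
using Col         = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Run the first registered km_kn_mn FP16 instance that accepts the given problem.
// p_a/p_b/p_c are assumed to be device buffers already allocated by the caller.
bool run_first_supported_gemm(const void* p_a,
                              const void* p_b,
                              void* p_c,
                              ck::index_t M,
                              ck::index_t N,
                              ck::index_t K,
                              ck::index_t StrideA,
                              ck::index_t StrideB,
                              ck::index_t StrideC)
{
    std::vector<std::unique_ptr<ck::tensor_operation::device::DeviceGemm<
        Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>
        instances;

    // Filled by the registration function defined in the km_kn_mn file above.
    ck::tensor_operation::device::instance::
        add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(instances);

    for(auto& gemm : instances)
    {
        auto argument = gemm->MakeArgumentPointer(p_a, p_b, p_c, M, N, K,
                                                  StrideA, StrideB, StrideC,
                                                  PassThrough{}, PassThrough{}, PassThrough{});

        // Each tuning configuration only covers certain shapes and alignments.
        if(gemm->IsSupportedArgument(argument.get()))
        {
            gemm->MakeInvokerPointer()->Run(argument.get(), StreamConfig{});
            return true;
        }
    }
    return false;
}

} // namespace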
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 
128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, 
LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 
1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, 
F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..ef8d7d4e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| 
ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 
8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 
64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, 
LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp new file mode 100644 index 00000000..cb65cc7b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 1, 1, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 1, 1, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 
0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 1, 1, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 1, 1, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 1, 1, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 1, 1, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck 
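The instance files above all follow the same pattern: each `add_device_gemm_xdl_c_shuffle_*_instances` function appends type-erased operation pointers to a caller-supplied vector, and a client then picks whichever instance reports the problem as supported. The sketch below illustrates that selection loop for the f32 `mk_nk_mn` (Row x Col = Row) case. It is a minimal sketch, not part of the generated files: the header paths, the exact `DeviceGemm<...>` base-interface template parameters, and the `MakeArgumentPointer`/`MakeInvokerPointer`/`IsSupportedArgument` signatures are assumed from CK's public device-operation interface, and `p_a`, `p_b`, `p_c` are placeholder device buffers.

// Selection sketch (assumed headers and interface; not part of the generated instance files).
#include <memory>
#include <vector>

#include "ck/ck.hpp"                                        // assumed: ck::index_t, StreamConfig
#include "ck/tensor_operation/gpu/device/device_gemm.hpp"   // assumed: DeviceGemm base interface
#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" // assumed: add_* declarations

using Row         = ck::tensor_layout::gemm::RowMajor;
using Col         = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Matches the element type, layouts, and elementwise ops of the f32 mk_nk_mn instance list.
using DeviceGemmF32 = ck::tensor_operation::device::
    DeviceGemm<Row, Col, Row, float, float, float, PassThrough, PassThrough, PassThrough>;

// p_a, p_b, p_c are device buffers of the given sizes; strides follow the usual GEMM convention.
void run_first_supported_gemm(const float* p_a, const float* p_b, float* p_c,
                              ck::index_t M, ck::index_t N, ck::index_t K,
                              ck::index_t StrideA, ck::index_t StrideB, ck::index_t StrideC)
{
    std::vector<std::unique_ptr<DeviceGemmF32>> instances;
    ck::tensor_operation::device::instance::
        add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(instances);

    for(auto& gemm : instances)
    {
        // Each instance checks its own tiling and vector-access constraints against the
        // problem shape and strides before it can be launched.
        auto argument = gemm->MakeArgumentPointer(p_a, p_b, p_c, M, N, K,
                                                  StrideA, StrideB, StrideC,
                                                  PassThrough{}, PassThrough{}, PassThrough{});

        if(gemm->IsSupportedArgument(argument.get()))
        {
            auto invoker = gemm->MakeInvokerPointer();
            invoker->Run(argument.get(), StreamConfig{});
            return;
        }
    }
}

In practice a client would time every supported instance in such a list (or go through the DeviceOperationInstanceFactory) and keep the fastest one for its problem size, rather than stopping at the first match as this sketch does.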
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp new file mode 100644 index 00000000..5b1014ed --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 1, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 1, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 1, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 1, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 1, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 1, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, 
PassThrough, GemmDefault, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..e6f6add8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + 
DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 4, 1, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 4, 1, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 4, 1, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 4, 1, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 
16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 4, 1, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 1, 0, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, 1, 1, 1, S<1, 16, 1, 16>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..80b3d03d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances = std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| 
PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 
0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, F32, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp new file mode 100644 index 00000000..93b3df1e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances = + std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, 
PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 4, 4, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 64, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 4, 4, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 
16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 4, 4, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 4, 4, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp new file mode 100644 index 00000000..f10365d8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances = + std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 4, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 4, 16, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, 
GemmDefault, 1, 128, 128, 128, 64, 4, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 4, 16, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 4, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 4, 16, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 4, 16, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 4, 16, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Col, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16> + 
// clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..a7a9eb62 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances = + std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 16, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 64, 16, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 16, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 16, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 16, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, 
int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 16, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, 0, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Row, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 16, 1, 1, 1, S<1, 64, 1, 4>, 16> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..9fb45b00 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances = + std::tuple< + // clang-format off + //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#####################| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //#####################| | | | | | | | | 
Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //#####################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, 
GemmDefault, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 16>, + DeviceGemm_Xdl_CShuffle< Row, Col, Row, int8_t, int8_t, int8_t, int32_t, int32_t, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 2>, 16> + // clang-format on + >; + +void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..18a78674 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_f16_f16_f16_km_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| 
ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, 
LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 
64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_xdl_f16_f16_f16_km_kn_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_km_kn_mn_instances{}); + add_device_operation_instances(instances, + device_gemm_xdl_f16_f16_f16_km_kn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp new file mode 100644 index 00000000..cef6070a --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_f16_f16_f16_km_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, 
LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_xdl_f16_f16_f16_km_nk_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| 
DstScalar| | | | + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_km_nk_mn_instances{}); + add_device_operation_instances(instances, + device_gemm_xdl_f16_f16_f16_km_nk_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..1be70d6c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 
16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, 
PassThrough, GemmDefault, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, 
PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 256, 4, 8, 32, 32, 1, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 64, 4, 8, 32, 32, 1, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 32, 4, 8, 32, 32, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 256, 4, 8, 16, 16, 1, 8, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 128, 4, 8, 16, 16, 1, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 64, 4, 8, 16, 16, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 16, 32, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_xdl_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| 
K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances{}); + add_device_operation_instances(instances, + device_gemm_xdl_f16_f16_f16_mk_kn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..6b8455ff --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNPadding = ck::tensor_operation::device::GemmSpecialization::MNPadding; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 
128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + 
DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| 
NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumPrefetch| LoopScheduler| Pipeline| + //###########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| | | | + //###########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| | | | + //###########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 4, 8, 16, 16, 2, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 4, 8, 16, 16, 2, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 8, 8, 16, 16, 2, 9, S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 8, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 1, LoopScheduler::Default, 
PipelineVersion::v2>, + DeviceGemmXdl< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmMNPadding, 256, 128, 144, 4, 8, 16, 16, 2, 9, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances{}); + add_device_operation_instances(instances, + device_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp new file mode 100644 index 00000000..b9e28e3d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_f32_f32_f32_km_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, 
PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp new file mode 100644 index 00000000..2b1a5a57 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
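
Each of these translation units ends the same way: the concrete DeviceGemmXdl configurations are collected in a std::tuple, and the add_device_gemm_xdl_*_instances() function forwards that tuple to add_device_operation_instances(), which appends one default-constructed object per tuple element to the caller's vector of device-operation pointers. The stand-alone sketch below models only that registration pattern with placeholder types; Base, InstanceA, InstanceB and append_instances are stand-ins, not CK names, and the real helper may differ in detail.

// --- illustrative sketch, not part of the patch ---
#include <memory>
#include <tuple>
#include <vector>

struct Base { virtual ~Base() = default; };
struct InstanceA : Base {};
struct InstanceB : Base {};

template <typename... Instances>
void append_instances(std::vector<std::unique_ptr<Base>>& out, std::tuple<Instances...>)
{
    // Default-construct one object of every type listed in the tuple and hand
    // ownership to the caller's vector, mirroring how the instance lists above
    // appear to be turned into runtime-selectable operations.
    (out.push_back(std::make_unique<Instances>()), ...);
}

int main()
{
    std::vector<std::unique_ptr<Base>> instances;
    append_instances(instances, std::tuple<InstanceA, InstanceB>{});
    return instances.size() == 2 ? 0 : 1;
}
// --- end of sketch ---
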
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_f32_f32_f32_km_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, 
PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..301d3b55 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, 
PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 4, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..cd16f35f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
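
Registering a dozen or more tile shapes per data type and layout only pays off if the caller can enumerate them and keep the fastest one that actually supports a given problem, which is essentially what the profiler and the client examples in this patch appear to do when they loop over the returned instances. The sketch below shows that selection loop in isolation; the Kernel interface, supports() and time_ms() are placeholders for whatever argument/invoker API the library exposes, not CK names.

// --- illustrative sketch, not part of the patch ---
#include <limits>
#include <memory>
#include <vector>

struct Kernel
{
    virtual ~Kernel() = default;
    virtual bool supports(int M, int N, int K) const   = 0;
    virtual double time_ms(int M, int N, int K) const  = 0; // measured run
};

const Kernel* pick_fastest(const std::vector<std::unique_ptr<Kernel>>& kernels,
                           int M, int N, int K)
{
    const Kernel* best = nullptr;
    double best_ms     = std::numeric_limits<double>::max();
    for(const auto& k : kernels)
    {
        if(!k->supports(M, N, K))
            continue; // e.g. a GemmDefault instance whose tile does not divide M/N
        const double ms = k->time_ms(M, N, K);
        if(ms < best_ms)
        {
            best_ms = ms;
            best    = k.get();
        }
    }
    return best; // nullptr if nothing supports the problem
}
// --- end of sketch ---
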
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, 
PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1>, + DeviceGemmXdl< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp new file mode 100644 index 00000000..39166698 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
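
The _mk/_km and _kn/_nk suffixes in these file and function names encode the memory layout of A and B and map directly onto the ALayout/BLayout template arguments: "mk" means A is stored as a[m][k] (RowMajor for an M-by-K A) and "km" as a[k][m] (ColumnMajor), while "kn" means B is stored as b[k][n] (RowMajor) and "nk" as b[n][k] (ColumnMajor); C is c[m][n] (RowMajor) throughout. The sketch below only spells out the resulting dense leading dimensions; Lds and leading_dims are illustrative names, not library code.

// --- illustrative sketch, not part of the patch ---
struct Lds { long lda; long ldb; long ldc; };

constexpr Lds leading_dims(long M, long N, long K, bool a_is_mk, bool b_is_kn)
{
    return Lds{
        a_is_mk ? K : M, // a[m][k] has row stride K, a[k][m] has row stride M
        b_is_kn ? N : K, // b[k][n] has row stride N, b[n][k] has row stride K
        N                // c[m][n] is row-major, row stride N
    };
}

// For example, the km_kn case ("a[k, m] * b[k, n]") with M=3840, N=4096, K=4096:
static_assert(leading_dims(3840, 4096, 4096, false, true).lda == 3840, "lda = M");
static_assert(leading_dims(3840, 4096, 4096, false, true).ldb == 4096, "ldb = N");
// --- end of sketch ---
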
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_f64_f64_f64_km_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp new file mode 100644 index 00000000..0a623034 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_f64_f64_f64_km_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 
64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..5ef8d08d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..c9557bae --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F64 = double; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances = + std::tuple< + // clang-format off + //##########| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| + //##########| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar| + //##########| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector| + //##########| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 2, 16, 16, 4, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 2, 16, 16, 4, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 2, 16, 16, 4, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 2, 16, 16, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 2, 16, 16, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 2, 16, 16, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 2, 16, 16, 2, 4, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 2, 16, 16, 4, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1>, + DeviceGemmXdl< F64, F64, F64, F64, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 2, 16, 16, 2, 4, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 7, 1> + // clang-format on + >; + +void add_device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_gemm_xdl_f64_f64_f64_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt new file mode 100644 index 00000000..bbf81a5f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_gemm_add_add_fastgelu_instance + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp +) diff --git 
a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp new file mode 100644 index 00000000..463e0865 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[k, m], b[k, n], d0[m, n], d1[m, n] +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, 
PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, 
LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, 
F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, 
S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + 
DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, 
S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance 
+} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp new file mode 100644 index 00000000..b71ff1b9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[k, m], b[n, k], d0[m, n], d1[m, n] +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + 
DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 
32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, 
LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, 
AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, 
F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances{}); + add_device_operation_instances( + instances, + 
device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp new file mode 100644 index 00000000..9060c9b1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[m, k], b[k, n], d0[m, n], d1[m, n] +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | 
| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, 
F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, 
Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 
128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + 
DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + 
device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp new file mode 100644 index 00000000..81cf01d6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[m, k], b[n, k], d0[m, n], d1[m ,n] +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | 
| PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, 
LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, 
F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, 
Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Row_Tuple, Row, F16, F16, F32, F32, F16_F16_Tuple, F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, 
LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt new file mode 100644 index 00000000..0beb10e3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_gemm_add_fastgelu_instance + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp new file mode 100644 index 00000000..4da85cc4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
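
For reference, the instance vectors populated by the add_device_..._instances functions above are what the CK instance factory hands back to client code. Below is a minimal sketch of that lookup, assuming the aggregate factory header ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp and the DeviceOperationInstanceFactory / GetTypeString APIs used by the client examples elsewhere in this patch; header paths and template-parameter order are illustrative, not verified against this revision.

// Sketch only: enumerate the registered f16 GEMM + AddAddFastGelu instances
// for the row-major A, column-major B (mk_nk) layout registered above.
#include <iostream>
#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

using Row            = ck::tensor_layout::gemm::RowMajor;
using Col            = ck::tensor_layout::gemm::ColumnMajor;
using F16            = ck::half_t;
using PassThrough    = ck::tensor_operation::element_wise::PassThrough;
using AddAddFastGelu = ck::tensor_operation::element_wise::AddAddFastGelu;

int main()
{
    // DeviceGemmMultipleD is the abstract base class the instances above implement;
    // the factory returns one std::unique_ptr per registered instance.
    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
        Row, Col, ck::Tuple<Row, Row>, Row,
        F16, F16, ck::Tuple<F16, F16>, F16,
        PassThrough, PassThrough, AddAddFastGelu>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " gemm+add+add+fastgelu instances\n";
    for(const auto& op : op_ptrs)
        std::cout << op->GetTypeString() << '\n';

    return 0;
}

In practice a client then builds an argument for each candidate instance, keeps only those for which IsSupportedArgument() returns true, and times them to pick the fastest, as the client examples in this patch do.
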
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0) +// outout: e[m, n] +// input: a[k, m], b[k, n], d0[m, n] +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, 
GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, 
F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 
8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 
256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, 
PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + 
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp new file mode 100644 index 00000000..ab83e4ba --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[k, m], b[n, k], d0[m, n], d1[m, n] +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, 
PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, 
Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, 
AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, 
F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| 
_NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp new file mode 100644 index 00000000..a4cd3fad --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
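// Illustrative sketch (not part of this patch): how client code typically discovers
// the instances that the add_device_gemm_add_fastgelu_* registration functions above
// make available, by querying the instance factory for a matching DeviceGemmMultipleD
// interface. The factory header path and the template-argument order of
// DeviceGemmMultipleD are assumptions based on the usual CK client-example pattern
// and should be checked against the library headers in this patch.
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp" // assumed factory header

int main()
{
    using F16         = ck::half_t;
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using Col         = ck::tensor_layout::gemm::ColumnMajor;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu;

    // Device-op interface matching the km_nk_mn_mn (A: Col, B: Col) instances above;
    // the argument order (layouts, data types, element-wise ops) is an assumption.
    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<
        Col, Col, ck::Tuple<Row>, Row,          // A / B / Ds / E layouts
        F16, F16, ck::Tuple<F16>, F16,          // A / B / Ds / E data types
        PassThrough, PassThrough, AddFastGelu>; // A / B / CDE element-wise ops

    // Enumerate every registered instance and print its configuration string.
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "found " << op_ptrs.size() << " instances\n";
    for(const auto& op_ptr : op_ptrs)
        std::cout << op_ptr->GetTypeString() << '\n';

    return 0;
}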
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[m, k], b[k, n], d0[m, n], d1[m, n] +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, 
PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, 
Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 
4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, 
AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, 
F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| 
_NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp new file mode 100644 index 00000000..207e76ff --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
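// Note (illustration only): the *_instance.cpp files in this directory differ only in
// the A/B layout tags encoded in their names. Per the comments in each file, "mk"
// means A is stored as a[m, k] (row-major), "km" as a[k, m] (column-major of the same
// M x K matrix), and likewise "kn"/"nk" for B. The indexing helper below illustrates
// that convention; treating the stride as the leading dimension of the stored matrix
// is the usual GEMM convention and is assumed here, not quoted from this patch.
#include <cstddef>

enum class Layout { RowMajor, ColMajor };

// Linear offset of logical element (m, k) of an M x K matrix A.
// RowMajor ("mk"): offset = m * stride + k   (stride >= K)
// ColMajor ("km"): offset = k * stride + m   (stride >= M)
inline std::size_t a_offset(std::size_t m, std::size_t k, std::size_t stride, Layout layout)
{
    return layout == Layout::RowMajor ? m * stride + k : k * stride + m;
}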
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b), d0, d1) +// outout: e[m, n] +// input: a[m, k], b[n, k], d0[m, n], d1[m ,n] +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, 
PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , 
+ DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, 
S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 64, 
64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances = + std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | 
| | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F32, F16_Tuple, F16, PassThrough, PassThrough, AddFastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt new file mode 100644 index 00000000..ccada3a8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_gemm_bias_add_reduce_instance + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp new file mode 100644 index 00000000..e8747af4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -0,0 +1,83 @@ +// SPDX-License-Identifier: 
MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[k, m] * b[k, n] +using device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData|C0Data|C1Data| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| C1| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | | | Operation| Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, 
ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace 
tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp new file mode 100644 index 00000000..ed54c3a9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[k, m] * b[n, k] +using device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData|C0Data|C1Data| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| C1| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | | | Operation| Operation| Operation| Operation| | | | 
Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, 
PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, 
ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..da7eae63 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[m, k] * b[n, k] +using device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances = + std::tuple< + // clang-format off + //##################################| ALayout| BLayout| CLayout|AData| BData| CData|C0Data|C1Data| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| C1| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //##################################| | | | Type| Type| Type| Type| Type| DataType| DataType| DataType| 
Type Tuple| Elementwise| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //##################################| | | | | | | | | | | | | Operation| Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 
8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, 
F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..34345095 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bias_add_reduce/device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
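+
+// This file registers the mk_nk layout variants (row-major A[m, k]; B stored as [n, k], i.e.
+// column-major for the GEMM) of DeviceGemmBiasAddReduce_Xdl_CShuffle. On top of the bias- and
+// add-fused GEMM output E[m, n], each instance writes two row-wise F32 reductions (a mean and a
+// mean-of-square, per the file name), which downstream code typically feeds into a follow-up
+// normalization step such as layernorm.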
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_bias_add_reduce_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+using ReducePtrsGlobal = ck::Tuple<F32*, F32*>;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using ReduceSum = ck::reduce::Add;
+using ReduceOps = ck::Tuple<ReduceSum, ReduceSum>;
+
+using Div = ck::tensor_operation::element_wise::UnaryDivide;
+using Identity = ck::tensor_operation::element_wise::PassThrough;
+using Square = ck::tensor_operation::element_wise::UnarySquare;
+using ReduceInElementOps = ck::Tuple<Identity, Square>;
+using ReduceOutElementOps = ck::Tuple<Div, Div>;
+
+using ReduceMemOp = ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd, ck::InMemoryDataOperationEnum::AtomicAdd>;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+// c[m, n] = a[m, k] * b[n, k]
+using device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances =
+ std::tuple<
+ // clang-format off
+ //##################################| ALayout| BLayout| CLayout|AData| BData| CData|C0Data|C1Data| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| C1| Dxs| DxsInEleOp| DxsAccEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+ //##################################| | | | Type| Type| Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
+ //##################################| | | | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
+ //##################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1,
0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, 
S<64, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1>, + DeviceGemmBiasAddReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1> + // clang-format on + >; + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, + device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt new file mode 100644 index 00000000..cb1b3a48 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_gemm_bilinear_instance + device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp + device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp + device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp + device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp new file mode 100644 index 00000000..55461dfb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
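+
+// Bilinear-epilogue GEMM instances for the km_kn layout: A is stored [k, m] (column-major) and
+// B is stored [k, n] (row-major), with E and the extra D tensor in row-major [m, n]. The Bilinear
+// element-wise op combines the GEMM result C with D as E = alpha * C + beta * D; the entries
+// below only vary the thread-block tiling and data-transfer vector widths of that computation.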
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e[m, n] = bilinear(a[k, m] * b[k, n], d[m, n]) +using device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances = std::tuple< + // clang-format off + // no padding + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, 
S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, 
PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + // M/N/K Padding + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, 
Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances( + std::vector>>& instances) +{ + 
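    // add_device_operation_instances expands the instance tuple above and
    // appends one kernel object per tuple element to `instances`, each wrapped
    // in a unique_ptr; callers then filter the list with IsSupportedArgument()
    // and pick the fastest configuration for their problem size.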
add_device_operation_instances( + instances, device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp new file mode 100644 index 00000000..405e6997 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e[m, n] = bilinear(a[k, m] * b[n, k], d[m, n]) +using device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple< + // clang-format off + // no padding + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, 
F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 
1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + // M/N/K Padding + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 
32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 
1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp new file mode 100644 index 00000000..9af31b3a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e[m, n] = bilinear(a[m, k] * b[k, n], d[m, n]) +using device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple< + // clang-format off + // no padding + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| 
Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 
1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + + // M/N/K padding + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< 
Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp new file mode 100644 index 00000000..841b7a1d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
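All of these bilinear instance files route the GEMM accumulator and the extra D tensor through the Bilinear CDE elementwise operation. A minimal host-side sketch of the intended math follows; the alpha/beta naming and the exact conversion behavior are assumptions, kept here only to make the "e[m, n] = bilinear(a * b, d)" comments concrete.

// Reference-semantics sketch, assuming Bilinear scales and sums its inputs:
// e[m][n] = alpha * c[m][n] + beta * d[m][n], with c the f32 GEMM accumulator.
struct BilinearRef
{
    float alpha = 1.0f;
    float beta  = 1.0f;

    float operator()(float c, float d) const { return alpha * c + beta * d; }
};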
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using F16_Tuple = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +using Row_Tuple = ck::Tuple; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Bilinear = ck::tensor_operation::element_wise::Bilinear; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e[m, n] = bilinear(a[m, k] * b[n, k], d[m, n]) +using device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple< + // clang-format off + // no padding + // N % 8 == 0 && K % 8 == 0 + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // M/N/K padding + // N % 8 == 0 && K % 8 == 0 + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| 
BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, 
+ DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // M/N/K padding + // N % 4 == 0 && K % 4 == 0 + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // M/N/K padding + 
// N % 8 == 0 && K % 1 == 0 + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 2, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 32>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 2, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 2, 1, 32>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 4, 1, 32>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 2, 1, 64>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 2, 1, 32>, 1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Row_Tuple, Row, F16, F16, F32, F16, F16_Tuple, F16, PassThrough, PassThrough, Bilinear, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 2, 1, 32>, 1> + + // clang-format on + >; + +void add_device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt new file mode 100644 index 00000000..17d27ab1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_gemm_fastgelu_instance + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..9f7f643b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
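// The add_device_gemm_*_instances() functions in these translation units all follow one
// pattern: a std::tuple of concrete DeviceGemmMultipleD_Xdl_CShuffle configurations is handed
// to add_device_operation_instances(), which appends one heap-allocated copy of each tuple
// element to the caller's vector of base-interface pointers. The sketch below illustrates
// that pattern with hypothetical stand-in types (BaseOp, OpA, OpB) and a hypothetical helper
// add_instances(); it is a simplified illustration of the idea, not the library's own helper.
#include <memory>
#include <tuple>
#include <vector>

struct BaseOp
{
    virtual ~BaseOp()                = default;
    virtual const char* Name() const = 0;
};

// Stand-ins for two tuning configurations of the same device kernel.
struct OpA : BaseOp { const char* Name() const override { return "OpA"; } };
struct OpB : BaseOp { const char* Name() const override { return "OpB"; } };

// Append a copy of every tuple element to the vector of base-class pointers.
template <typename... Ops>
void add_instances(std::vector<std::unique_ptr<BaseOp>>& instances,
                   const std::tuple<Ops...>& new_ops)
{
    std::apply([&](const Ops&... op) {
                   (instances.push_back(std::make_unique<Ops>(op)), ...);
               },
               new_ops);
}

// Usage, mirroring add_device_gemm_bilinear_xdl_c_shuffle_..._mk_nk_mn_mn_instances():
//   std::vector<std::unique_ptr<BaseOp>> instances;
//   add_instances(instances, std::tuple<OpA, OpB>{});
//   // instances.size() == 2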
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b)) +// outout: e[m, n] +// input: a[k, m], b[k, n] +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 
128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, 
F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 
8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 
256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, 
Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + 
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp new file mode 100644 index 00000000..c8e9f35d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
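// Once an instance library such as device_gemm_fastgelu_instance is linked in, client code
// normally never names the individual DeviceGemmMultipleD_Xdl_CShuffle configurations; it asks
// the instance factory for every registered implementation of the abstract operation and then
// filters and times them. The minimal enumeration sketch below assumes the factory header
// ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp and the
// DeviceOperationInstanceFactory interface used by the client examples in this patch; adjust
// the header and template arguments if the actual build differs.
#include <iostream>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
// Assumed factory header backed by the device_gemm_fastgelu_instance library:
#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp"

int main()
{
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using Col         = ck::tensor_layout::gemm::ColumnMajor;
    using F16         = ck::half_t;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using FastGelu    = ck::tensor_operation::element_wise::FastGelu;

    // Abstract operation matching the km_kn instances registered above:
    // a[k, m] (column-major A), b[k, n] (row-major B), e[m, n], no D tensors.
    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<Col,
                                                                       Row,
                                                                       ck::Tuple<>,
                                                                       Row,
                                                                       F16,
                                                                       F16,
                                                                       ck::Tuple<>,
                                                                       F16,
                                                                       PassThrough,
                                                                       PassThrough,
                                                                       FastGelu>;

    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    std::cout << "registered gemm+fastgelu (km_kn) instances: " << op_ptrs.size() << std::endl;
    for(const auto& op_ptr : op_ptrs)
        std::cout << "  " << op_ptr->GetTypeString() << std::endl;

    // A real client would additionally call MakeArgumentPointer() and IsSupportedArgument()
    // on each candidate and keep the fastest supported one, as the GEMM client examples in
    // this patch do.
    return 0;
}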
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b)) +// outout: e[m, n] +// input: a[k, m], b[k, n] +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 
128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, 
F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 
8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 
256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, 
Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + 
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Col, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..5f804d45 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
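// Each instance file above pairs "regular" GemmDefault configurations (large block tiles,
// vectorized global memory access) with one "irregular tile size" GemmMNKPadding
// configuration (a small 64x16x16 tile with scalar, width-1 access). The intent is that the
// GemmDefault instances are only eligible when the problem size already satisfies their tile
// and vector-access requirements (the divisibility conditions the grouping comments record,
// e.g. "N % 4 == 0 && K % 4 == 0"), while the padded scalar instance stays valid for any
// M/N/K; at run time IsSupportedArgument() performs the real check. The helper below sketches
// that eligibility test with assumed, simplified conditions; it is an approximation for
// illustration, not the library's IsSupportedArgument().
#include <cstdint>

struct TileConfig
{
    std::int64_t MPerBlock; // e.g. 256
    std::int64_t NPerBlock; // e.g. 128
    std::int64_t KPerBlock; // e.g. 32
    std::int64_t VectorN;   // assumed CDE store width along N, e.g. 8
    std::int64_t VectorK;   // assumed A/B load width along K,  e.g. 8
    bool PadsMNK;           // true for the GemmMNKPadding "irregular" instance
};

// Assumed rule: a non-padding instance needs M/N/K to be multiples of its block tile, and the
// vectorized accesses need N and K to be multiples of the vector widths; a padding instance is
// always dimension-eligible.
bool is_dimension_eligible(const TileConfig& cfg, std::int64_t M, std::int64_t N, std::int64_t K)
{
    if(cfg.PadsMNK)
        return true;

    const bool tile_ok =
        (M % cfg.MPerBlock == 0) && (N % cfg.NPerBlock == 0) && (K % cfg.KPerBlock == 0);
    const bool vector_ok = (N % cfg.VectorN == 0) && (K % cfg.VectorK == 0);
    return tile_ok && vector_ok;
}

// Example: M = N = K = 1000 fails the 256x128x32 GemmDefault tile check, so only the padded
// 64x16x16 instance (PadsMNK = true) remains eligible for that shape.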
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b)) +// outout: e[m, n] +// input: a[m, k], b[k, n] +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 
128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, 
F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 
8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 
256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, 
Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + 
//##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Row, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..60cb138f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
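The add_device_gemm_fastgelu_* functions only register instances; a client selects one through the operation-instance factory, following the pattern used by the client examples. Below is a minimal sketch of that lookup for the mk_kn_mn (row-major A and B) f16 case registered above. The factory header name (gemm_fastgelu.hpp) and the exact DeviceGemmMultipleD parameter order are assumptions based on the surrounding instance files, not guaranteed by this patch:

#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp" // assumed factory header
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"

int main()
{
    using Row         = ck::tensor_layout::gemm::RowMajor;
    using F16         = ck::half_t;
    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
    using FastGelu    = ck::tensor_operation::element_wise::FastGelu;

    // Base class of the instances registered above (assumed parameter order):
    // DeviceGemmMultipleD<ALayout, BLayout, DsLayout, ELayout,
    //                     AData, BData, DsData, EData, AOp, BOp, CDEOp>
    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD<Row, Row, ck::Tuple<>, Row,
                                                                       F16, F16, ck::Tuple<>, F16,
                                                                       PassThrough, PassThrough, FastGelu>;

    // Collect every instance registered for this problem description.
    const auto op_ptrs = ck::tensor_operation::device::instance::
        DeviceOperationInstanceFactory<DeviceOp>::GetInstances();

    // A caller would build an Argument for its problem size, filter with IsSupportedArgument(),
    // then time the remaining instances and keep the fastest one.
    return op_ptrs.empty() ? 1 : 0;
}

The irregular-tile instances (GemmMNKPadding with 16x16 tiles) appear to serve as the fallback candidates for problem sizes that the large power-of-two tiles cannot cover.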
+ +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp" +#include "ck/utility/sequence.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using S = ck::Sequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// e = elementwise((a * b)) +// outout: e[m, n] +// input: a[m, k], b[n, k] +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 
128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< 
Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, 
LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 64, 64, 32, 8, 
8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2>, + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +// irregular tile size +using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple< + // clang-format off + //##############################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| LoopScheduler| Pipeline| + //##############################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| | | + //##############################| | | | | | | | | | | 
Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //##############################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + // pipeline v1, 1 wave + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v1> +#if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES + // pipeline v1, 2 waves + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Interwave, PipelineVersion::v1> +#endif +#if CK_EXPERIMENTAL_PIPELINE_V2_INSTANCES + // pipeline v2, 1 wave + , + DeviceGemmMultipleD_Xdl_CShuffle< Row, Col, Empty_Tuple, Row, F16, F16, F32, F32, Empty_Tuple, F16, PassThrough, PassThrough, FastGelu, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, LoopScheduler::Default, PipelineVersion::v2> +#endif + // clang-format on + >; + +void add_device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances{}); + add_device_operation_instances( + instances, + device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt new file mode 100644 index 00000000..2b2cf8c7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_gemm_reduce_instance + device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp + device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp + device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp + device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp new file mode 100644 index 00000000..59e2b2da --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[k, m] * b[k, n] +using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances = std::tuple< + // clang-format off + //###########################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //###########################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //###########################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //###########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + 
DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 
1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp new file mode 100644 index 00000000..bb09bf8b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
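A note on what these gemm+reduce instances compute: each one produces the GEMM output c together with row-wise reductions written through ReducePtrsGlobal. Judging by the Identity/Square input ops and the UnaryDivide output op defined in these files, the reductions are the per-row mean and mean-of-squares of c, but that pairing is an inference, not something this patch states. A host-side reference sketch under that assumption, with layouts following the km_kn_mn comment c[m, n] = a[k, m] * b[k, n]:

#include <cstddef>
#include <vector>

// Reference for GEMM plus per-row mean and mean-of-squares reductions (assumed semantics).
void reference_gemm_mean_meansquare(const std::vector<float>& a, // a[k, m], row-major K x M
                                    const std::vector<float>& b, // b[k, n], row-major K x N
                                    std::vector<float>& c,       // c[m, n], row-major M x N
                                    std::vector<float>& mean,    // mean[m]
                                    std::vector<float>& meansq,  // meansq[m]
                                    std::size_t M, std::size_t N, std::size_t K)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        float sum = 0.f, sum_sq = 0.f;
        for(std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(std::size_t k = 0; k < K; ++k)
                acc += a[k * M + m] * b[k * N + n];
            c[m * N + n] = acc;
            sum    += acc;       // ReduceInElementOp: Identity
            sum_sq += acc * acc; // ReduceInElementOp: Square
        }
        mean[m]   = sum / N;     // ReduceOutElementOp: UnaryDivide by N
        meansq[m] = sum_sq / N;
    }
}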
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[k, m] * b[n, k] +using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances = std::tuple< + // clang-format off + //###########################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //###########################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //###########################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //###########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + 
DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 
8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Col, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..0a3b566d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instance.cpp @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
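Each of these translation units ends by handing a std::tuple of concrete instance types to add_device_operation_instances, which fills the caller's vector. A minimal sketch of what such a helper presumably does, default-constructing one object per tuple element and registering it through a base-interface pointer, is below; the names are placeholders and this is an assumption for illustration, not the library's actual definition.

// Illustrative sketch only: how a tuple of instance types could be expanded
// into a vector of base-interface pointers. BasePtr is a placeholder for
// whatever smart-pointer type the real helper takes.
#include <memory>
#include <tuple>
#include <vector>

template <typename BasePtr, typename... Ops>
void add_device_operation_instances_sketch(std::vector<BasePtr>& instances,
                                           const std::tuple<Ops...>&)
{
    // Default-construct one object per tuple element and register it.
    (instances.push_back(std::make_unique<Ops>()), ...);
}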
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[m, k] * b[n, k] +using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances = std::tuple< + // clang-format off + //###########################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //###########################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData|Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //###########################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //###########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + 
DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 
2, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, false, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Row, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1> + // clang-format on + >; + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..2b17e47b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_reduce/device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instance.cpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
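The km_kn_mn, km_nk_mn, mk_kn_mn and mk_nk_mn suffixes in these file names encode the A, B and C layouts: mk means A is indexed a[m, k] (RowMajor), km means a[k, m] (ColumnMajor), likewise kn versus nk for B, with mn always a RowMajor C. The comment table below restates that correspondence; it mirrors the ALayout/BLayout/CLayout template arguments already visible in the instance lists.

// Layout naming convention used by these instance files, restated with local
// tag types for illustration (the instances themselves use the ck:: layout tags).
struct RowMajorTag {};
struct ColMajorTag {};

// file suffix   ALayout      BLayout      CLayout      computes
// mk_kn_mn      RowMajorTag  RowMajorTag  RowMajorTag  c[m, n] = a[m, k] * b[k, n]
// mk_nk_mn      RowMajorTag  ColMajorTag  RowMajorTag  c[m, n] = a[m, k] * b[n, k]
// km_kn_mn      ColMajorTag  RowMajorTag  RowMajorTag  c[m, n] = a[k, m] * b[k, n]
// km_nk_mn      ColMajorTag  ColMajorTag  RowMajorTag  c[m, n] = a[k, m] * b[n, k]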
+ +#include + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_reduce_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; +using ReducePtrsGlobal = ck::Tuple; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ReduceSum = ck::reduce::Add; +using ReduceOps = ck::Tuple; + +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using ReduceMemOp = ck::InMemoryDataOperationEnumSequence; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// c[m, n] = a[m, k] * b[n, k] +using device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances = std::tuple< + // clang-format off + //###########################| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| ReduceData| A| B| C| Reduce| ReduceInEleOp| ReduceAccEleOp| Reduce| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy| + //###########################| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Operation| | | MemoryData| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector| + //###########################| | | | | | | | | | | Operation| Operation| Operation| | | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock| + //###########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + 
DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 
true, 1, 1, S<1, 32, 1, 4>, 8, S<64, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8, S<32, 4>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1>, + DeviceGemmReduce_Xdl_CShuffle< Row, Col, Row, F16, F16, F16, F32, F32, F32, ReducePtrsGlobal, PassThrough, PassThrough, PassThrough, ReduceOps, ReduceInElementOps, ReduceOutElementOps, ReduceMemOp, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8, S<32, 2>, 4, 1> + // clang-format on + >; + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances( + std::vector>& instances) +{ + add_device_operation_instances( + instances, device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt new file mode 100644 index 00000000..6b336227 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt @@ -0,0 +1,10 @@ +add_instance_library(device_gemm_splitk_instance + device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..e178d3b0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
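The gemm_splitk instances that follow partition the K dimension of the GEMM into several slices and accumulate each slice's partial product into C. A host-side sketch of that decomposition, assuming the partial results are combined by plain accumulation into the same output buffer (which is why a split-K path needs the output zeroed or an accumulating store), is:

// Host-side sketch of split-K GEMM (illustrative only): K is divided into
// k_batch chunks; each chunk computes a partial GEMM and accumulates into C.
#include <cstddef>
#include <vector>

void gemm_splitk_ref(const std::vector<float>& a, // a[m, k], row-major
                     const std::vector<float>& b, // b[k, n], row-major
                     std::vector<float>& c,       // c[m, n]
                     std::size_t M, std::size_t N, std::size_t K,
                     std::size_t k_batch)
{
    c.assign(M * N, 0.f); // output must start zeroed when splits accumulate
    const std::size_t k_per_batch = (K + k_batch - 1) / k_batch;
    for(std::size_t kb = 0; kb < k_batch; ++kb) // on the GPU these slices run concurrently
    {
        const std::size_t k_begin = kb * k_per_batch;
        const std::size_t k_end   = (k_begin + k_per_batch < K) ? k_begin + k_per_batch : K;
        for(std::size_t m = 0; m < M; ++m)
            for(std::size_t n = 0; n < N; ++n)
            {
                float partial = 0.f;
                for(std::size_t k = k_begin; k < k_end; ++k)
                    partial += a[m * K + k] * b[k * N + n];
                c[m * N + n] += partial; // accumulation across K slices
            }
    }
}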
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp new file mode 100644 index 00000000..52be9fe7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..9b5ff404 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
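A caller typically fills a vector through one of these add_* functions, for example add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances above, and then iterates over the instances it received. The sketch below only assumes that the pointed-to operator exposes GetTypeString(), as the client examples elsewhere in this patch do; the vector's element type is whatever device-op pointer type the add_* function declares.

// Sketch of inspecting whichever instances a registration function fills in.
#include <iostream>
#include <vector>

template <typename DeviceOpPtr>
void list_instances(const std::vector<DeviceOpPtr>& instances)
{
    std::cout << "found " << instances.size() << " instances\n";
    for(const auto& op : instances)
        std::cout << "  " << op->GetTypeString() << "\n"; // assumed base-operator interface
}

// Usage sketch:
//   std::vector<...> instances;   // element type as declared by the add_* function
//   ck::tensor_operation::device::instance::
//       add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(instances);
//   list_instances(instances);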
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 8, true, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..7fc35c41 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 8, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGemmXdlSplitKCShuffle< F16, F16, F16, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 8, 8, true, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 8, 8, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp new file mode 100644 index 00000000..f27b2199 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
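A small consistency observation on the tile parameters: the f16 split-K instances above use K1 = 8 and a CBlockTransfer ScalarPerVector of 8, while the f32 instances that follow use 4 for both, so in each case a vectorized access moves 16 bytes. The arithmetic, stated as a compile-time check with plain sizes rather than the ck types:

// Compile-time check that the per-vector element counts used in these tables
// correspond to the same 16-byte access width for f16 and f32.
#include <cstddef>

constexpr std::size_t kHalfBytes  = 2; // sizeof(ck::half_t)
constexpr std::size_t kFloatBytes = 4; // sizeof(float)

static_assert(8 * kHalfBytes  == 16, "f16 instances: 8 elements per vector is 16 bytes");
static_assert(4 * kFloatBytes == 16, "f32 instances: 4 elements per vector is 16 bytes");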
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[k, n] = c[m, n] +using device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 8>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp new file mode 100644 index 00000000..b9a10955 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[k, m] * b[n, k] = c[m, n] +using device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Col, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 4, true, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..44e5f597 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
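
A minimal, hedged sketch of how client code might consume the add_device_gemm_xdl_splitk_* registration functions defined in these gemm_splitk instance files. The exact element type of the instance vector is not spelled out in these hunks, so DeviceGemmSplitKPtr below is a hypothetical alias for the unique_ptr-to-base-operator type those functions expect, and the header that declares the registration function is likewise omitted:

    // Hedged usage sketch (illustration only, not part of this patch).
    // DeviceGemmSplitKPtr: hypothetical alias for the smart-pointer element type
    // accepted by the add_device_gemm_xdl_splitk_* functions above.
    #include <iostream>
    #include <vector>

    void print_available_f32_splitk_gemm_instances()
    {
        std::vector<DeviceGemmSplitKPtr> instances;

        // Registers the DeviceGemmXdlSplitKCShuffle configurations listed above for
        // the a[k, m] * b[n, k] = c[m, n] (Col, Col, Row) layout combination.
        ck::tensor_operation::device::instance::
            add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(instances);

        for(const auto& op : instances)
            std::cout << op->GetTypeString() << '\n'; // base-operator name query
    }
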
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[k, n] = c[m, n] +using device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 1, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Row, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 2, 2, 4, true, 1, 1, S<1, 32, 1, 8>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..f3a9063f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
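
For reference, a hedged summary (illustration only) of how the file-name suffixes of these split-K instance files map onto the ALayout/BLayout/CLayout template arguments of DeviceGemmXdlSplitKCShuffle, as stated by the "Compilation parameters" comments in each file:

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    // a[k, m] * b[n, k] = c[m, n]  ->  ALayout = Col, BLayout = Col, CLayout = Row  (km_nk_mn)
    // a[m, k] * b[k, n] = c[m, n]  ->  ALayout = Row, BLayout = Row, CLayout = Row  (mk_kn_mn)
    // a[m, k] * b[n, k] = c[m, n]  ->  ALayout = Row, BLayout = Col, CLayout = Row  (mk_nk_mn)
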
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_splitk_c_shuffle.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// Compilation parameters for a[m, k] * b[n, k] = c[m, n] +using device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances = std::tuple< + // clang-format off + //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //#########################| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Specialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector| + //#########################| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl| _NWaveNPerXdl| + //#########################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 64, 4, 4, 32, 
32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 64, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 64, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 32, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 32, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGemmXdlSplitKCShuffle< F32, F32, F32, F32, Row, Col, Row, PassThrough, PassThrough, PassThrough, GemmDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 1>, S<0, 2, 1, 3>, S<0, 2, 1, 3>, 3, 4, 4, true, S<1, 4, 16, 1>, S<0, 1, 3, 2>, S<0, 1, 3, 2>, 3, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, + device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt new file mode 100644 index 00000000..3808e024 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/CMakeLists.txt @@ -0,0 +1,5 @@ +add_instance_library(device_grouped_conv1d_bwd_weight_instance + device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp + device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp + device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp +) diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp new file mode 100644 index 00000000..05ba4492 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_grouped_conv1d_bwd_weight_xdl_c_shuffle_gnwc_gkxc_gnwk_bf16_f32_bf16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +using device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_1x1_s1_p0_bf16_f32_bf16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, 
PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv1d_bwd_weight_xdl_c_shuffle_gnwc_gkxc_gnwk_bf16_f32_bf16_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_1x1_s1_p0_bf16_f32_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp new file mode 100644 index 00000000..7a610a74 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instance.cpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_grouped_conv1d_bwd_weight_xdl_c_shuffle_gnwc_gkxc_gnwk_f16_default_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, 
F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +using device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_1x1_s1_p0_f16_instances = 
std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 
2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv1d_bwd_weight_xdl_c_shuffle_gnwc_gkxc_gnwk_f16_default_instances{}); + add_device_operation_instances( + instances, device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp new file mode 100644 index 00000000..90e074f0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_bwd_weight/device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instance.cpp @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
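
A short, hedged sketch of how the grouped conv1d backward-weight registration functions in these files would typically be called. As in the GEMM sketch earlier, the element type of the instance vector is not shown in these hunks, so DeviceConvBwdWeightPtr is a hypothetical placeholder for the unique_ptr-to-base-operator type they expect:

    // Hedged usage sketch (illustration only, not part of this patch).
    #include <vector>

    void collect_conv1d_bwd_weight_f16_instances()
    {
        std::vector<DeviceConvBwdWeightPtr> instances; // hypothetical element type

        // Per the function body above, this appends both configuration tuples for the
        // GNWC/GKXC/GNWK fp16 path: the ConvBwdWeightDefault instances and the
        // ConvBwdWeightFilter1x1Stride1Pad0 instances.
        ck::tensor_operation::device::instance::
            add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances(instances);

        // A caller would then keep only the instances whose IsSupportedArgument(...)
        // check passes for its problem sizes and pick the fastest one; that selection
        // loop is omitted here.
    }
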
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, wi, c] * wei[k, x, c] = out[n, wo, k] +using device_grouped_conv1d_bwd_weight_xdl_c_shuffle_gnwc_gkxc_gnwk_f32_default_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +using device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, 
PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 1, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv1d_bwd_weight_xdl_c_shuffle_gnwc_gkxc_gnwk_f32_default_instances{}); + add_device_operation_instances( + instances, device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt new file mode 100644 index 00000000..1d90593e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_grouped_conv1d_fwd_instance + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp new file mode 100644 index 00000000..74aebf10 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instance.cpp @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: MIT +// 
Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, wi, c] * wei[g, k, x, c] = out[g, n, wo, k] +using device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 
1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, 
ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp new file mode 100644 index 00000000..361ea8f4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instance.cpp @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, wi, c] * wei[g, k, x, c] = out[g, n, wo, k] +using device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, 
PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, 
PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, 
F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| 
CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, 
GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp new file mode 100644 index 00000000..3145b716 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instance.cpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, wi, c] * wei[g, k, x, c] = out[g, n, wo, k] +using device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, 
PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, 
PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, 
F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| 
CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, 
GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + // clang-format on + >; + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp new file mode 100644 index 00000000..cde93f90 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv1d_fwd/device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instance.cpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNWC = ck::tensor_layout::convolution::GNWC; +using GKXC = ck::tensor_layout::convolution::GKXC; +using GNWK = ck::tensor_layout::convolution::GNWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, wi, c] * wei[g, k, x, c] = out[g, n, wo, k] +using device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances = std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, 
PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, 
+ DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 
8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| 
BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 1, GNWC, GKXC, Empty_Tuple, GNWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt new file mode 100644 index 00000000..3b2968d4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt @@ -0,0 +1,3 @@ +add_instance_library(device_grouped_conv2d_bwd_data_instance + device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp +) diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp new file mode 100644 index 00000000..3d604d42 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdDataDefault = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default; + +static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Filter1x1Stride1Pad0; + +using device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances = std::tuple< + // clang-format off + // 1. 
Default + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 128, 64, 128, 32, 8, 8, 
32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault, true, true, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // 2. 
Filter1x1Stride1Pad0 + // ##############################################| NDim| ALayout| BLayout| DsLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| AElementwise| BElementwise| CDEElementwise| ConvolutionBackward| DoPad| DoPad| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffleMXdl| CShuffleNXdl| CDEBlockTransfer| CDEBlockTransfer| + // ##############################################| Spatial| | | | | Type| Type| Type| DataType| Type| Type| Operation| Operation| Operation| DataSpecialization| GemmM| GemmN| PrefetchStage| Size| Block| Block| Block| | | XDL| XDL| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| PerWave| PerWave| _MBlock_MPerBlock| ScalarPerVector| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | Lengths_AK0_M_AK1| ArrangeOrder| | | PerVector| PerVector_AK1| | Lengths_BK0_N_BK1| ArrangeOrder| | | PerVector| PerVector_BK1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| + // ##############################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, 
PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1< 2, GNHWK, GKYXC, Empty_Tuple, GNHWC, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvBwdDataFilter1x1Stride1Pad0, true, true, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 1, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_conv2d_bwd_data_xdl_gnhwc_gkyxc_gnhwk_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt new file mode 100644 index 00000000..4009121e --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_grouped_conv2d_bwd_weight_instance + device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp + device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp + device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp +) + diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp new file mode 100644 index 00000000..ede21f1f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | 
| | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 
2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +using device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_bf16_f32_bf16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_bf16_f32_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git 
a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp new file mode 100644 index 00000000..99e55661 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_f16_default_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +using device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_f16_instances = std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, 
PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_f16_default_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp new file mode 100644 index 00000000..15871a28 --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k] +using device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_f32_default_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, 
PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +using 
device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_f32_instances = std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, 
ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 2, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv2d_bwd_weight_xdl_c_shuffle_gnhwc_gkyxc_gnhwk_f32_default_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt new file mode 100644 index 00000000..5ef1b686 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt @@ -0,0 +1,13 @@ +add_instance_library(device_grouped_conv2d_fwd_instance + # GNHWC, GKYXC, GNHWK + device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp + device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp + device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp + device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp + # NHWGC, GKYXC, NHWGK + device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp + #dl + 
device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp new file mode 100644 index 00000000..fc18b3c7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using InDataType = ck::half_t; +using WeiDataType = ck::half_t; +using AccDataType = float; +using OutDataType = ck::half_t; + +using Empty_Tuple = ck::Tuple<>; +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; +static constexpr auto Filter1x1Pad0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; +static constexpr auto Filter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances = std::tuple< + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| 
DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Pad0_instances = std::tuple< + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, Filter1x1Pad0, GemmPadingSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Stride1Pad0_instances = + std::tuple< + // clang-format off + // ########################################| NDim| InData| WeiData| 
MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, Filter1x1Stride1Pad0, GemmPadingSpec, 256, 128, 128, 16, 2, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<8, 1, 1, 2>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 2>, S<1, 2, 0, 3>, S<1, 1, 1, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances{}); + + add_device_operation_instances( + instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Pad0_instances{}); + + add_device_operation_instances( + instances, + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_Filter1x1Stride1Pad0_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp new file mode 100644 index 00000000..648b3963 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instance.cpp @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
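+// This translation unit registers the DL (non-XDL) grouped 2D forward convolution instances for
+// f32 tensors in GNHWC/GKYXC/GNHWK layouts; the Default, Filter1x1Pad0, and Filter1x1Stride1Pad0
+// specializations defined below are all appended by
+// add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances().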
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using InDataType = float; +using WeiDataType = float; +using AccDataType = float; +using OutDataType = float; + +using Empty_Tuple = ck::Tuple<>; +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; +static constexpr auto Filter1x1Pad0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; +static constexpr auto Filter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances = std::tuple< + // clang-format off + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, 
GemmPadingSpec, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Pad0_instances = std::tuple< + // clang-format off + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, Filter1x1Pad0, GemmPadingSpec, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Stride1Pad0_instances = + std::tuple< + // clang-format off + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| 
ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, Filter1x1Stride1Pad0, GemmPadingSpec, 256, 128, 128, 16, 1, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<8, 1, 1, 1>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 1>, S<1, 2, 0, 3>, S<1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances{}); + + add_device_operation_instances( + instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Pad0_instances{}); + + add_device_operation_instances( + instances, + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_Filter1x1Stride1Pad0_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp new file mode 100644 index 00000000..1cb5d069 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instance.cpp @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
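+// Same registration pattern as the f32 DL file above, but for int8 tensors with int32 accumulation;
+// all three convolution specializations defined below are appended by
+// add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances().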
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using InDataType = int8_t; +using WeiDataType = int8_t; +using AccDataType = int32_t; +using OutDataType = int8_t; + +using Empty_Tuple = ck::Tuple<>; +template +using S = ck::Sequence; + +using InElementOp = ck::tensor_operation::element_wise::PassThrough; +using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; +static constexpr auto Filter1x1Pad0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; +static constexpr auto Filter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmPadingSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances = std::tuple< + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, ConvSpec, GemmPadingSpec, 256, 128, 
128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Pad0_instances = std::tuple< + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, Filter1x1Pad0, GemmPadingSpec, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +using device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Stride1Pad0_instances = + std::tuple< + // clang-format off + // ########################################| NDim| InData| WeiData| MultpleD| OutData| AccData| InLayout| WeiLayout| MultipleD| OutLayout| In| Wei| Out| Convolution| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| + // ########################################| Spatial| Type| Type| Type| Type| Type| | | Layout| | Elementwise| Elementwise| Elementwise| Forward| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| 
SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| + // ########################################| | | | | | | | | | | Operation| Operation| Operation| Specialization| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | + // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK< 2, InDataType, WeiDataType, Empty_Tuple, OutDataType, AccDataType, InLayout, WeiLayout, Empty_Tuple, OutLayout, InElementOp, WeiElementOp, OutElementOp, Filter1x1Stride1Pad0, GemmPadingSpec, 256, 128, 128, 16, 4, 4, 4, 1, S<8, 2>, S<8, 2>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<8, 1, 1, 4>, S<2, 1, 128, 1>, S<1, 2, 0, 3>, S<1, 2, 0, 3>, S<4, 1, 1, 4>, S<1, 2, 0, 3>, S<1, 1, 1, 4>, S<0, 1, 2, 3, 4, 5>, 5, 4> + // clang-format on + >; + +void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_instances{}); + + add_device_operation_instances( + instances, device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Pad0_instances{}); + + add_device_operation_instances( + instances, + device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_int8_Filter1x1Stride1Pad0_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp new file mode 100644 index 00000000..29f33103 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
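+// XDL (MFMA-based) grouped 2D forward convolution instances for bf16 with f32 accumulation and the
+// CShuffle output stage, GNHWC/GKYXC/GNHWK layouts; the tuples below cover the Default,
+// Filter1x1Pad0, and Filter1x1Stride1Pad0 specializations.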
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto ConvFwdOddC = + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] +using device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 
1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, 
GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| 
CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // OddC + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + 
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 8, 8>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 256, 64, 32, 8, 8, 32, 32, 4, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 1, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, 
device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp new file mode 100644 index 00000000..6a4a3d2a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto ConvFwdOddC = + ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, hi ,wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] +using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + 
//########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 
64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, 
ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, 
ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, 
F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // OddC + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| 
BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 256, 64, 32, 8, 8, 32, 32, 4, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 1, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp new file mode 100644 index 00000000..1fec35fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] +using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| 
Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, 
S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, 
ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, 
PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, 
Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + // clang-format on + >; + +void 
add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp new file mode 100644 index 00000000..59b01213 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instance.cpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k] +using device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances = std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | 
| | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, 
PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, 
+ DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 
32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 
2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, 
S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>
+        // clang-format on
+        >;
+
+void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleD<
+        2, GNHWC, GKYXC, Empty_Tuple, GNHWK, int8_t, int8_t, Empty_Tuple, int8_t,
+        PassThrough, PassThrough, PassThrough>>>& instances)
+{
+    add_device_operation_instances(
+        instances, device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_int8_instances{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
new file mode 100644
index 00000000..8aca7304
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instance.cpp
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvFwdDefault =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
+
+static constexpr auto ConvFwd1x1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0;
+
+static constexpr auto ConvFwd1x1S1P0 =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0;
+
+static constexpr auto ConvFwdOddC =
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::OddC;
+
+static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+using device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances =
+    std::tuple<
+        // clang-format off
+        // Default
+        //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
+        //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL|
Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 
1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 
0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 
2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 
1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // OddC + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| 
KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, 
PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 8, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 4, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<4, 2, 8>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 256, 256, 64, 32, 8, 8, 32, 32, 4, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 32, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, 
ConvFwdOddC, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, NHWGC, GKYXC, Empty_Tuple, NHWGK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdOddC, GemmMNKPadding, 1, 128, 64, 64, 32, 8, 8, 32, 32, 1, 2, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, S<2, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 1, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt new file mode 100644 index 00000000..04cad43e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt @@ -0,0 +1,5 @@ +add_instance_library(device_grouped_conv3d_bwd_weight_instance + device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp + device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp + device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp new file mode 100644 index 00000000..e48db4a5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
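
Every registration function in this patch follows the same pattern: a std::tuple of fully-specialized device-op types is expanded and each element is appended, type-erased, to the caller's vector by add_device_operation_instances. The snippet below is a compilable approximation of that helper for readers unfamiliar with the idiom; it is a sketch, not CK's actual implementation.

#include <memory>
#include <tuple>
#include <vector>

// Sketch only: default-construct each configuration type in the tuple and
// store it, type-erased, behind a pointer to the common base operation.
// CK's real helper has the same effect but a different implementation.
template <typename BaseOp, typename... OpTypes>
void add_device_operation_instances_sketch(std::vector<std::unique_ptr<BaseOp>>& instances,
                                           std::tuple<OpTypes...>)
{
    (instances.push_back(std::make_unique<OpTypes>()), ...);
}
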
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = bhalf_t; +using F32 = float; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +using device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 
1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +using device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_bf16_f32_bf16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| 
NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, BF16, F32, BF16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_bf16_f32_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp new file mode 100644 index 00000000..1655850e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
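
Note the data-type triple in the bf16 backward-weight instances above: per the column headers, InData and OutData are BF16 while WeiData (the computed weight gradient) is F32, matching the bf16_f32_bf16 suffix of the registration function. The struct below is a host-side sketch of buffers sized for that convention; the type names are placeholders, not the library's.

#include <cstdint>
#include <vector>

// Stand-in for 16-bit bfloat storage (ck::bhalf_t in the library); used here
// only to make the sketch self-contained.
using bf16_storage_t = std::uint16_t;

struct ConvBwdWeightBf16Buffers
{
    std::vector<bf16_storage_t> input;       // in[g, n, di, hi, wi, c]  (bf16)
    std::vector<float>          weight_grad; // wei[g, k, z, y, x, c]    (written as f32)
    std::vector<bf16_storage_t> output_grad; // out[g, n, do, ho, wo, k] (bf16)
};
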
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +using device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f16_default_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, 
S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +using device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f16_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 8, 32, 32, 2, 4, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 8, 32, 32, 4, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 8, 32, 32, 2, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, 
PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 8, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 8>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 1, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 8, 32, 32, 2, 1, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F16, F16, F16, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 8, 32, 32, 1, 2, S<1, 4, 4, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 2, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 8, 4, true, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f16_default_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp new file mode 100644 index 00000000..aba46b7e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_gnwc_gkxc_gnwk_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvBwdWeightDefault = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default; + +static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 = + ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0; + +// Compilation parameters for in[n, di, hi, wi, c] * wei[k, z, y, x, c] = out[n, do, ho, wo, k] +using device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f32_default_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + 
DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightDefault, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +using device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f32_instances = + std::tuple< + // clang-format off + //#########################################| Num| InData| WeiData| OutData| AccData| In| Wei| Out| ConvBackward| Block| MPer| NPer| K0Per| K1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransfer| CBlockTransfer| + //#########################################| Dim| Type| Type| Type| Type| Elementwise| Elementwise| Elementwise| Weight| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ClusterLengths| ScalarPerVector| + //#########################################| Spatial| | | | | Operation| Operation| Operation| Specialization| | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| MBlock_MPerBlock| NWaveNPerXdl| + //#########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | NBlock_NPerBlock| | + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 256, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 256, 4, 4, 32, 32, 2, 4, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 64, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 8>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 128, 4, 4, 32, 32, 4, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 64, 128, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 64, 4, 4, 32, 32, 2, 2, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, 
PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 128, 64, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 256, 64, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 16, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 128, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 128, 32, 128, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 4>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 1, true, S<1, 4, 32, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 32, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 64, 32, 4, 4, 32, 32, 2, 1, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, 1, 1, S<1, 16, 1, 4>, 4>, + DeviceGroupedConvBwdWeightGnwcGkxcGnwk_Xdl_CShuffle< 3, F32, F32, F32, F32, PassThrough, PassThrough, PassThrough, ConvBwdWeightFilter1x1Stride1Pad0, 64, 32, 64, 4, 4, 32, 32, 1, 2, S<1, 4, 8, 2>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 2, true, S<1, 4, 16, 1>, S<0, 3, 1, 2>, S<0, 2, 1, 3>, 2, 4, 4, true, 1, 1, S<1, 16, 1, 4>, 4> + // clang-format on + >; + +void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv3d_bwd_weight_xdl_c_shuffle_gndhwc_gkzyxc_gndhwk_f32_default_instances{}); + add_device_operation_instances( + instances, + device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_1x1_s1_p0_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt new file mode 100644 index 00000000..78eedca5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_grouped_conv3d_fwd_instance + device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp + device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp + device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp + device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp new file mode 100644 index 00000000..b4ae8b6c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instance.cpp 
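
Each backward-weight registration function above appends two tuples, the Default instances and the Filter1x1Stride1Pad0 instances, into the same vector; which instance actually runs for a given problem is left to the caller, typically by probing each op's IsSupportedArgument check. Below is a hedged, generic sketch of that selection loop; the argument construction is abstracted behind a callback because the op's MakeArgumentPointer parameter list is not reproduced in this patch.

#include <memory>
#include <vector>

// Sketch of the usual instance-selection loop: try each registered instance
// and keep the first whose IsSupportedArgument check passes.
// `make_argument_for` stands in for the op-specific argument construction.
template <typename DeviceOpPtr, typename MakeArgFn>
DeviceOpPtr* pick_first_supported(std::vector<DeviceOpPtr>& instances,
                                  MakeArgFn&& make_argument_for)
{
    for(auto& op : instances)
    {
        auto arg = make_argument_for(*op); // e.g. op.MakeArgumentPointer(...)
        if(op->IsSupportedArgument(arg.get()))
        {
            return &op;
        }
    }
    return nullptr; // no registered instance supports this problem description
}
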
@@ -0,0 +1,129 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using BF16 = ck::bhalf_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k] +using device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 
2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, 
PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + 
//########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, 
S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, BF16, BF16, F32, BF16, Empty_Tuple, BF16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp new file mode 100644 index 00000000..061674bd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced 
Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k] +using device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, 
GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 
8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| 
ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, 
PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp new file mode 100644 index 00000000..ed7e5476 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
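+// Usage sketch (illustration only): client code normally does not call
+// add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances() directly;
+// it asks the instance factory for all registered instances and profiles them.
+// Roughly, assuming the DeviceGroupedConvFwdMultipleD interface and the
+// DeviceOperationInstanceFactory helper:
+//
+//   using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD<
+//       3, GNDHWC, GKZYXC, ck::Tuple<>, GNDHWK,
+//       float, float, ck::Tuple<>, float,
+//       PassThrough, PassThrough, PassThrough>;
+//   const auto op_ptrs = ck::tensor_operation::device::instance::
+//       DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
+//   // for each ptr: build the argument, check IsSupportedArgument(), time the
+//   // supported ones and keep the fastest instance.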
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k] +using device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances = + std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, 
PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, 
Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, 
S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| 
ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 16, 4, 4, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 16, 4, 4, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 16, 4, 4, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 16, 
4, 4, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 16, 4, 4, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 16, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 16, 4, 4, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 16>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 16, 4, 4, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, F32, F32, F32, F32, Empty_Tuple, F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 16, 4, 4, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 4, 4, 1, 1, 1, S<1, 8, 1, 8>, 4> + // clang-format on + >; + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp new file mode 100644 index 00000000..bf5fa306 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instance.cpp @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
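+// Note: the int8_t instances below keep the same tile shapes and the 8-wide
+// K1/vector configuration used by the f16/bf16 files, but accumulate in int32_t
+// (AccDataType) while A, B, CShuffle and E stay int8_t;
+// add_device_operation_instances() appends one entry per tuple element at the end
+// of this file, exactly as in the other data-type variants.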
+ +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Empty_Tuple = ck::Tuple<>; + +template +using S = ck::Sequence; + +using GNDHWC = ck::tensor_layout::convolution::GNDHWC; +using GKZYXC = ck::tensor_layout::convolution::GKZYXC; +using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; + +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +// in[g, n, di, hi, wi, c] * wei[g, k, z, y, x, c] = out[g, n, do, ho, wo, k] +using device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances = std::tuple< + // clang-format off + // Default + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, 
int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 
S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwdDefault, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, 
Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + + // Filter1x1Stride1Pad0 + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, 
int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 3, GNDHWC, GKZYXC, Empty_Tuple, GNDHWK, int8_t, int8_t, int32_t, int8_t, Empty_Tuple, int8_t, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt new file mode 100644 index 00000000..82beb2ac --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt @@ -0,0 +1,6 @@ 
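The four instance files added below register tile-shape variants of DeviceGroupedGemm_Xdl, one file per F16 layout combination. A minimal consumption sketch follows; it is illustrative only, with assumed header paths and an assumed DeviceGroupedGemm base-interface parameter order, since the exact signatures do not survive in this copy of the patch.

// Illustrative sketch, not part of this patch. Header paths marked below and the
// DeviceGroupedGemm template-argument order are assumptions.
#include <iostream>
#include <memory>
#include <vector>

#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp"       // assumed path
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp"    // assumed path for the add_..._instances declarations

namespace ck_inst = ck::tensor_operation::device::instance;

using Row         = ck::tensor_layout::gemm::RowMajor;
using Col         = ck::tensor_layout::gemm::ColumnMajor;
using F16         = ck::half_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;

// Assumed base interface for the a[k, m] * b[k, n] = e[m, n] case registered below.
using GroupedGemmOp = ck::tensor_operation::device::DeviceGroupedGemm<
    Col, Row, ck::Tuple<>, Row, F16, F16, ck::Tuple<>, F16,
    PassThrough, PassThrough, PassThrough>;

int main()
{
    std::vector<std::unique_ptr<GroupedGemmOp>> ops;
    ck_inst::add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances(ops);

    // Every registered tile configuration reports its name through GetTypeString().
    for(const auto& op : ops)
        std::cout << op->GetTypeString() << '\n';
    return 0;
}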
+add_instance_library(device_grouped_gemm_instance + device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp + device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp + device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp + device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp new file mode 100644 index 00000000..5f2097b0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instance.cpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// a[k, m] * b[k, n] = e[m, n] +using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, 
GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 2, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 2, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 2, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 2, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 2, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, 
S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 2, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedGemm_Xdl< Col, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 16, 64, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 4>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + +void add_device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_f16_f16_f16_km_kn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace 
tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp new file mode 100644 index 00000000..677bd1a2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instance.cpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// a[k, m] * b[n, k] = e[m, n] +using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, 
F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 2, 8, 32, 32, 2, 4, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 2, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 2, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 2, 8, 32, 32, 2, 2, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 2, 8, 32, 32, 2, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, S<4, 64, 1>, 
S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 2, 8, 32, 32, 1, 2, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedGemm_Xdl< Col, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 16, 64, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 4>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + +void add_device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_f16_f16_f16_km_nk_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp 
b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp new file mode 100644 index 00000000..95a1a87d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// a[m, k] * b[k, n] = e[m, n] +using device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 2, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 
2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 2, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 2, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 2, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<16,16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, 
PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 2, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<8, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 2, 0, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1, 1, 1, S<1, 32, 1, 8>, 8> + // clang-format on + >; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedGemm_Xdl< Row, Row, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 16, 64, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 2, 1, S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + +void add_device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp new file mode 100644 index 00000000..a103406d --- /dev/null +++ 
b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +using S = ck::Sequence; + +using Empty_Tuple = ck::Tuple<>; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; + +// a[m, k] * b[n, k] = e[m, n] +using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, 
PassThrough, GemmDefault, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmDefault, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8> + // clang-format on + >; + +static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +using device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple< + // clang-format off + //###################| A| B| Ds| E| AData| BData| AccData| CShuffle| DsData| EData| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| 
BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //###################| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //###################| | | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //###################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 64, 16, 16, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1>, + DeviceGroupedGemm_Xdl< Row, Col, Empty_Tuple, Row, F16, F16, F32, F16, Empty_Tuple, F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding, 1, 256, 16, 64, 32, 8, 8, 16, 16, 1, 1, S<4, 16, 4>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1> + // clang-format on + >; + +void add_device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instances{}); + add_device_operation_instances( + instances, device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_irregular_tile_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt new file mode 100644 index 00000000..aa0cc114 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -0,0 +1,4 @@ +add_instance_library(device_normalization_instance + device_normalization_f16_instance.cpp + device_normalization_f32_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp new file mode 100644 index 00000000..8994d9dc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f16_instance.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
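The template parameters of the instance tuple and of the DeviceNormalizationImpl entries in the file below do not survive in this copy of the patch; the comment records what can still be read off the registration functions.

// Illustrative note (an inference, not from the original source): judging from the
// registration function names at the bottom of this file, the instance tuple is
// presumably templated on the tensor rank and the number of reduced dimensions
// and instantiated three times:
//
//   add_device_normalization_rank_2_1_f16_instances  ->  <Rank = 2, NumReduceDim = 1>
//   add_device_normalization_rank_4_3_f16_instances  ->  <Rank = 4, NumReduceDim = 3>
//   add_device_normalization_rank_5_3_f16_instances  ->  <Rank = 5, NumReduceDim = 3>
//
// Rank 2 with 1 reduced dim is the 2-D layernorm case (normalize each row of an
// [M, N] input); rank 4 / 3 and rank 5 / 3 normalize over the trailing three
// dimensions of 4-D and 5-D inputs respectively.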
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F16 = ck::half_t; +using F32 = float; + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +// clang-format off +using device_normalization_f16_instances = + std::tuple < + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorDim, GammaSrcVectorSize, BetaSrcVectorDim, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + >; +// clang-format on + +void add_device_normalization_rank_2_1_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +void add_device_normalization_rank_4_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +void add_device_normalization_rank_5_3_f16_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_normalization_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp new file mode 100644 index 00000000..4a7e1fd0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/normalization/device_normalization_f32_instance.cpp @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
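For reference, the normalization these device instances implement is the usual layernorm-style formula y = (x - mean) / sqrt(variance + epsilon) * gamma + beta, with mean and variance taken over the reduced dimensions. A plain scalar sketch of that computation for the rank-2, one-reduced-dim case (illustrative only, not taken from the library):

// Illustrative reference implementation, not part of this patch.
#include <cmath>
#include <vector>

void normalize_rows_reference(const std::vector<float>& x,     // [M, N], row-major
                              const std::vector<float>& gamma, // [N]
                              const std::vector<float>& beta,  // [N]
                              std::vector<float>& y,           // [M, N]
                              int M, int N, float epsilon = 1e-4f)
{
    for(int m = 0; m < M; ++m)
    {
        float mean = 0.f, mean_sq = 0.f;
        for(int n = 0; n < N; ++n)
        {
            const float v = x[m * N + n];
            mean += v;
            mean_sq += v * v;
        }
        mean /= N;
        const float variance = mean_sq / N - mean * mean;
        const float inv_std  = 1.f / std::sqrt(variance + epsilon);

        // scale and shift with the learned gamma/beta parameters
        for(int n = 0; n < N; ++n)
            y[m * N + n] = (x[m * N + n] - mean) * inv_std * gamma[n] + beta[n];
    }
}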
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_normalization_impl.hpp" +#include "ck/utility/data_type.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; + +using Pass = ck::tensor_operation::element_wise::PassThrough; + +template +using device_layernorm_f32_instances = std::tuple< + // clang-format off + // XDataType, GammaDataType, BetaDataType, AccDataType, YDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, XYSrcVectorDim, XSrcVectorSize, GammaSrcVectorSize, BetaSrcVectorSize, YDstVectorSize> + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, // fallback kernel + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl, + DeviceNormalizationImpl + // clang-format on + >; + +void add_device_normalization_rank_2_1_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +void add_device_normalization_rank_4_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +void add_device_normalization_rank_5_3_f32_instances( + std::vector>>& + instances) +{ + add_device_operation_instances(instances, device_layernorm_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt new file mode 100644 index 00000000..9f826afd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt @@ -0,0 +1,6 @@ +add_instance_library(device_quantization_instance + device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp + device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp + device_conv2d_xdl_perchannel_quantization_int8_instance.cpp + device_conv2d_xdl_perlayer_quantization_int8_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp new file mode 100644 index 00000000..e87e9875 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
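The instances in the file below fuse a quantization epilogue into the int8 convolution. A scalar model of that epilogue, assuming the usual requantization semantics implied by the operator names (Add_Activation_Mul_Clamp and friends, defined in device_conv2d_xdl_int8_instance.hpp): add the int32 bias, apply the activation, rescale by a per-layer or per-channel float factor, then saturate to the int8 range. Illustrative only; the exact rounding behaviour is an assumption.

// Illustrative scalar model, not part of this patch.
#include <algorithm>
#include <cmath>
#include <cstdint>

inline std::int8_t bias_relu_requant_reference(std::int32_t acc,  // int32 conv accumulator
                                               std::int32_t bias, // per-channel bias (first D tensor)
                                               float scale)       // per-layer or per-channel requant scale
{
    float v = static_cast<float>(acc + bias);            // Add
    v       = std::max(v, 0.f);                          // Activation (Relu)
    v       = v * scale;                                 // Mul / Mul2 (requantization scale)
    v       = std::round(v);                             // assumed round-to-nearest
    v       = std::min(std::max(v, -128.f), 127.f);      // Clamp to int8 range
    return static_cast<std::int8_t>(v);
}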
+ +#include "device_conv2d_xdl_int8_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_conv2d_bias_perchannel_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); +} + +void add_device_conv2d_bias_relu_perchannel_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); +} +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp new file mode 100644 index 00000000..06eed760 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_conv2d_xdl_int8_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_conv2d_bias_perlayer_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); +} + +void add_device_conv2d_bias_relu_perlayer_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); + + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + + add_device_operation_instances(instances, + device_conv2d_int8_32Ds_instances{}); +} +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_int8_instance.hpp b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_int8_instance.hpp new file mode 100644 index 00000000..6904e269 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_int8_instance.hpp @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using Empty_Tuple = ck::Tuple<>; +template +using S = ck::Sequence; + +using GNHWC = ck::tensor_layout::convolution::GNHWC; +using GKYXC = ck::tensor_layout::convolution::GKYXC; +using GNHWK = ck::tensor_layout::convolution::GNHWK; +using GK = ck::tensor_layout::convolution::G_K; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using Relu = ck::tensor_operation::element_wise::Relu; + +using GK_Tuple = ck::Tuple; +using GK_GK_Tuple = ck::Tuple; +using I32_Tuple = ck::Tuple; +using F32_Tuple = ck::Tuple; +using I32_F32_Tuple = ck::Tuple; + +using Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; +using Relu_Mul_Clamp = ck::tensor_operation::element_wise::Activation_Mul_Clamp; + +using Add_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; +using Add_Relu_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; + +using Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; +using Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Activation_Mul2_Clamp; + +using Add_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; +using Add_Relu_Mul2_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul2_Clamp; + +static constexpr ck::index_t NDimSpatial = 2; +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; +static constexpr auto ConvFwdDefault = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; +static constexpr auto ConvFwd1x1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Pad0; +static constexpr auto ConvFwd1x1S1P0 = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0; + +template +// clang-format off +using device_conv2d_int8_instances = + std::tuple < + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| 
PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, 
GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 16>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 16> + >; +// clang-format on + +// for conv + multiple of 32 bit Ds. bit of Ds will affect the ScalarPerVector of C +template +// clang-format off +using device_conv2d_int8_32Ds_instances = + std::tuple < + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 256, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, 
PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 256, 64, 16, 16, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 128, 64, 16, 16, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 64, 128, 64, 16, 16, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 64, 64, 16, 16, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 128, 64, 64, 16, 16, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 256, 64, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 4>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 128, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 64, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 128, 32, 128, 64, 16, 16, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 4>, 8>, + 
DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 64, 32, 64, 16, 16, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8>, + DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< 2, GNHWC, GKYXC, DsLayout, GNHWK, int8_t, int8_t, int32_t, int32_t, DsDatatype, int8_t, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmSpec, 1, 64, 32, 64, 64, 16, 16, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1, 1, 1, S<1, 32, 1, 2>, 8> + >; +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp new file mode 100644 index 00000000..5f1aa0c5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "device_conv2d_xdl_int8_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_conv2d_perchannel_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); +} + +void add_device_conv2d_relu_perchannel_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_32Ds_instances{}); +} +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp new file mode 100644 index 00000000..83435d81 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/quantization/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
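The per-layer and per-channel quantization instance files differ mainly in their output element-wise operation: Mul_Clamp / Add_Mul_Clamp take a single per-layer scale, while the Mul2_Clamp / Add_Mul2_Clamp variants take a per-output-channel scale carried as an extra D tensor, with the Add_ and Relu_ prefixes selecting a fused bias and activation. The scalar sketch below shows what such an int8 requantization epilogue is assumed to compute per output element; the exact rounding and clamping behaviour of CK's Activation_Mul_Clamp operators is not spelled out in this patch, so the details here are assumptions.

#include <algorithm>
#include <cmath>
#include <cstdint>

// acc:   int32 convolution accumulator
// bias:  int32 bias term (pass 0 for the no-bias variants)
// scale: per-layer constant, or the per-channel entry for this output channel
inline std::int8_t requantize_sketch(std::int32_t acc,
                                     std::int32_t bias,
                                     float scale,
                                     bool apply_relu)
{
    float v = static_cast<float>(acc + bias);       // "Add" step (bias variants only)
    if(apply_relu) v = std::max(v, 0.0f);           // "Activation" step (Relu variants only)
    v = v * scale;                                  // "Mul" step: requantization scale
    v = std::round(v);                              // rounding mode is an assumption
    v = std::min(std::max(v, -128.0f), 127.0f);     // "Clamp" to the int8 range
    return static_cast<std::int8_t>(v);
}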
+ +#include "device_conv2d_xdl_int8_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { +void add_device_conv2d_perlayer_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_conv2d_int8_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_instances{}); +} + +void add_device_conv2d_relu_perlayer_quantization_int8_instances( + std::vector>>& instances) +{ + add_device_operation_instances( + instances, + device_conv2d_int8_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_instances{}); + add_device_operation_instances( + instances, + device_conv2d_int8_instances{}); +} +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt new file mode 100644 index 00000000..31ae7226 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -0,0 +1,76 @@ +add_instance_library(device_reduce_instance + device_reduce_instance_blockwise_f16_f16_f16_min.cpp + device_reduce_instance_blockwise_f16_f16_f16_max.cpp + device_reduce_instance_blockwise_f16_f16_f16_amax.cpp + device_reduce_instance_blockwise_f16_f32_f16_add.cpp + device_reduce_instance_blockwise_f16_f32_f16_avg.cpp + device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp + device_reduce_instance_blockwise_f32_f32_f32_add.cpp + device_reduce_instance_blockwise_f32_f32_f32_avg.cpp + device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp + device_reduce_instance_blockwise_f32_f32_f32_min.cpp + device_reduce_instance_blockwise_f32_f32_f32_max.cpp + device_reduce_instance_blockwise_f32_f32_f32_amax.cpp + device_reduce_instance_blockwise_f32_f64_f32_add.cpp + device_reduce_instance_blockwise_f32_f64_f32_avg.cpp + device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp + device_reduce_instance_blockwise_f64_f64_f64_add.cpp + device_reduce_instance_blockwise_f64_f64_f64_avg.cpp + device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp + device_reduce_instance_blockwise_f64_f64_f64_min.cpp + device_reduce_instance_blockwise_f64_f64_f64_max.cpp + device_reduce_instance_blockwise_f64_f64_f64_amax.cpp + device_reduce_instance_blockwise_i8_i32_i8_add.cpp + device_reduce_instance_blockwise_i8_i32_i8_avg.cpp + device_reduce_instance_blockwise_i8_i8_i8_min.cpp + device_reduce_instance_blockwise_i8_i8_i8_max.cpp + device_reduce_instance_blockwise_i8_i8_i8_amax.cpp + device_reduce_instance_blockwise_b16_f32_b16_add.cpp + device_reduce_instance_blockwise_b16_f32_b16_avg.cpp + device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp + device_reduce_instance_blockwise_b16_f32_b16_min.cpp + device_reduce_instance_blockwise_b16_f32_b16_max.cpp + device_reduce_instance_blockwise_b16_f32_b16_amax.cpp + device_reduce_instance_threadwise_f16_f16_f16_min.cpp + device_reduce_instance_threadwise_f16_f16_f16_max.cpp + device_reduce_instance_threadwise_f16_f16_f16_amax.cpp + device_reduce_instance_threadwise_f16_f32_f16_add.cpp + device_reduce_instance_threadwise_f16_f32_f16_avg.cpp + device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp + device_reduce_instance_threadwise_f32_f32_f32_add.cpp + device_reduce_instance_threadwise_f32_f32_f32_avg.cpp + device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp + 
device_reduce_instance_threadwise_f32_f32_f32_min.cpp + device_reduce_instance_threadwise_f32_f32_f32_max.cpp + device_reduce_instance_threadwise_f32_f32_f32_amax.cpp + device_reduce_instance_threadwise_f32_f64_f32_add.cpp + device_reduce_instance_threadwise_f32_f64_f32_avg.cpp + device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp + device_reduce_instance_threadwise_f64_f64_f64_add.cpp + device_reduce_instance_threadwise_f64_f64_f64_avg.cpp + device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp + device_reduce_instance_threadwise_f64_f64_f64_min.cpp + device_reduce_instance_threadwise_f64_f64_f64_max.cpp + device_reduce_instance_threadwise_f64_f64_f64_amax.cpp + device_reduce_instance_threadwise_i8_i32_i8_add.cpp + device_reduce_instance_threadwise_i8_i32_i8_avg.cpp + device_reduce_instance_threadwise_i8_i8_i8_min.cpp + device_reduce_instance_threadwise_i8_i8_i8_max.cpp + device_reduce_instance_threadwise_i8_i8_i8_amax.cpp + device_reduce_instance_threadwise_b16_f32_b16_add.cpp + device_reduce_instance_threadwise_b16_f32_b16_avg.cpp + device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp + device_reduce_instance_threadwise_b16_f32_b16_min.cpp + device_reduce_instance_threadwise_b16_f32_b16_max.cpp + device_reduce_instance_threadwise_b16_f32_b16_amax.cpp + device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp + device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp + device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp new file mode 100644 index 00000000..1909183a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
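The reduce instance library enumerated above covers three kernel families (threadwise, blockwise, and multiblock with atomic add) crossed with the reduce operations add, avg, norm2, min, max and amax for several type combinations. As a reference for the operation semantics implied by those file names, here is a scalar sketch; it is not CK code, only the assumed mathematical meaning of each operation.

#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference for the reduce operations named in the instance files.
struct ReduceReference
{
    static float add(const std::vector<float>& x)
    {
        float s = 0.f;
        for(float v : x) s += v;
        return s;
    }
    static float avg(const std::vector<float>& x) { return add(x) / static_cast<float>(x.size()); }
    static float norm2(const std::vector<float>& x) // L2 norm: sqrt of the sum of squares
    {
        float s = 0.f;
        for(float v : x) s += v * v;
        return std::sqrt(s);
    }
    static float min(const std::vector<float>& x) { return *std::min_element(x.begin(), x.end()); }
    static float max(const std::vector<float>& x) { return *std::max_element(x.begin(), x.end()); }
    static float amax(const std::vector<float>& x) // max of absolute values
    {
        float m = 0.f;
        for(float v : x) m = std::max(m, std::fabs(v));
        return m;
    }
};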
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp new file mode 100644 index 00000000..ec302010 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp new file mode 100644 index 00000000..89f3e582 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp new file mode 100644 index 00000000..f1bdd192 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp new file mode 100644 index 00000000..58e9c562 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp new file mode 100644 index 00000000..e5012c65 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp new file mode 100644 index 00000000..0970cb9d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp new file mode 100644 index 00000000..6ee179a5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp new file mode 100644 index 00000000..e53b4030 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp new file mode 100644 index 00000000..cab5738f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp new file mode 100644 index 00000000..7d2a4fad --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp new file mode 100644 index 00000000..e08b64f8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp new file mode 100644 index 00000000..89cabf37 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp new file mode 100644 index 00000000..1e602c12 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp new file mode 100644 index 00000000..489b4bc4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp new file mode 100644 index 00000000..04e2c5b1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp new file mode 100644 index 00000000..5c0e5360 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp new file mode 100644 index 00000000..899dfcd3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp new file mode 100644 index 00000000..5624337a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp new file mode 100644 index 00000000..2f3067ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp new file mode 100644 index 00000000..2648e7d5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp new file mode 100644 index 00000000..f67ae2ee --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp new file mode 100644 index 00000000..6f8e0785 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp new file mode 100644 index 00000000..69fecf72 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp new file mode 100644 index 00000000..129a4f0f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp new file mode 100644 index 00000000..21babc4a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp new file mode 100644 index 00000000..b85b3e2b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp new file mode 100644 index 00000000..24a8293b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp new file mode 100644 index 00000000..73e60fa9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp new file mode 100644 index 00000000..72e649d8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp new file mode 100644 index 00000000..a7e053a0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp new file mode 100644 index 00000000..0e3abd35 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp new file mode 100644 index 00000000..4b324560 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp new file mode 100644 index 00000000..3298587a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp new file mode 100644 index 00000000..729d4fd6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp new file mode 100644 index 00000000..e3e36e31 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp new file mode 100644 index 00000000..e7580e7d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp new file mode 100644 index 00000000..1e6feb00 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp new file mode 100644 index 00000000..669c4d34 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp new file mode 100644 index 00000000..335a5474 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp new file mode 100644 index 00000000..e95e8391 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp new file mode 100644 index 00000000..25498158 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp new file mode 100644 index 00000000..7262b8a5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp new file mode 100644 index 00000000..c526a74f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp new file mode 100644 index 00000000..4c7252e7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp new file mode 100644 index 00000000..618900a7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp new file mode 100644 index 00000000..ce747cbc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp new file mode 100644 index 00000000..06f622b9 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp new file mode 100644 index 00000000..708eb58d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp new file mode 100644 index 00000000..c8a62fa1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp new file mode 100644 index 00000000..ce209215 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp new file mode 100644 index 00000000..29251a8b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp new file mode 100644 index 00000000..734fa9fd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp new file mode 100644 index 00000000..d7a0e2bf --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp new file mode 100644 index 00000000..8b97f300 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp new file mode 100644 index 00000000..53d01e38 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp new file mode 100644 index 00000000..125d054f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp new file mode 100644 index 00000000..fb86a2bb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp new file mode 100644 index 00000000..49af0839 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp new file mode 100644 index 00000000..30cc1b13 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp new file mode 100644 index 00000000..24f8a9ba --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp new file mode 100644 index 00000000..a26702f0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp new file mode 100644 index 00000000..34fe3262 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp new file mode 100644 index 00000000..74b15edd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp new file mode 100644 index 00000000..65762492 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp new file mode 100644 index 00000000..5e74295a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp new file mode 100644 index 00000000..6fdea6cc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp new file mode 100644 index 00000000..317d573d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp new file mode 100644 index 00000000..29f95ebc --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp new file mode 100644 index 00000000..aa9f47cb --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp new file mode 100644 index 00000000..54a9dd1a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp new file mode 100644 index 00000000..4ef5717b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp new file mode 100644 index 00000000..140a3c19 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp new file mode 100644 index 00000000..317b4ad3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt new file mode 100644 index 00000000..fc13261a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt @@ -0,0 +1,26 @@ +add_instance_library(device_softmax_instance + device_softmax_i8_i8_instance.cpp + device_softmax_i8_i8_instance_rank3_reduce1.cpp + device_softmax_i8_i8_instance_rank3_reduce2.cpp + device_softmax_i8_i8_instance_rank3_reduce3.cpp + device_softmax_i8_i8_instance_rank4_reduce1.cpp + device_softmax_i8_i8_instance_rank4_reduce2.cpp + device_softmax_i8_i8_instance_rank4_reduce3.cpp + device_softmax_i8_i8_instance_rank4_reduce4.cpp + device_softmax_f16_f16_instance.cpp + device_softmax_f16_f16_instance_rank3_reduce1.cpp + device_softmax_f16_f16_instance_rank3_reduce2.cpp + 
device_softmax_f16_f16_instance_rank3_reduce3.cpp + device_softmax_f16_f16_instance_rank4_reduce1.cpp + device_softmax_f16_f16_instance_rank4_reduce2.cpp + device_softmax_f16_f16_instance_rank4_reduce3.cpp + device_softmax_f16_f16_instance_rank4_reduce4.cpp + device_softmax_f32_f32_instance.cpp + device_softmax_f32_f32_instance_rank3_reduce1.cpp + device_softmax_f32_f32_instance_rank3_reduce2.cpp + device_softmax_f32_f32_instance_rank3_reduce3.cpp + device_softmax_f32_f32_instance_rank4_reduce1.cpp + device_softmax_f32_f32_instance_rank4_reduce2.cpp + device_softmax_f32_f32_instance_rank4_reduce3.cpp + device_softmax_f32_f32_instance_rank4_reduce4.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp new file mode 100644 index 00000000..14d27645 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_instances( + std::vector>& instances) +{ + add_device_softmax_f16_f16_rank3_reduce1_instances(instances); + add_device_softmax_f16_f16_rank3_reduce2_instances(instances); + add_device_softmax_f16_f16_rank3_reduce3_instances(instances); +} + +void add_device_softmax_f16_f16_rank4_instances( + std::vector>& instances) +{ + add_device_softmax_f16_f16_rank4_reduce1_instances(instances); + add_device_softmax_f16_f16_rank4_reduce2_instances(instances); + add_device_softmax_f16_f16_rank4_reduce3_instances(instances); + add_device_softmax_f16_f16_rank4_reduce4_instances(instances); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp new file mode 100644 index 00000000..fa334b99 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
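The per-rank registration functions aggregated by add_device_softmax_f16_f16_rank3_instances above also lost their template arguments in this capture. A hedged sketch of what one reduce-dimension-specific body most likely expands to is given below; the DeviceSoftmaxPtr parameter list and the <RANK, 1> arguments on the instance list are assumptions inferred from the headers the file includes, not text preserved in the patch.

```cpp
// Sketch only: the template arguments are assumed, following the naming of the
// included device_softmax_f16_f16_instance_type.hpp header.
void add_device_softmax_f16_f16_rank3_reduce1_instances(
    std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
{
    // registers every tuning-parameter variant for rank-3 tensors reducing 1 dimension
    add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
}
```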
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f16_f16_rank3_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp new file mode 100644 index 00000000..1c9d37d8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f16_f16_rank3_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp new file mode 100644 index 00000000..5fbdab50 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f16_f16_rank3_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp new file mode 100644 index 00000000..7dd8640b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp new file mode 100644 index 00000000..b32fe683 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp new file mode 100644 index 00000000..c05048ec --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp new file mode 100644 index 00000000..6a235708 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp new file mode 100644 index 00000000..e5bec5e2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_instances( + std::vector>& instances) +{ + add_device_softmax_f32_f32_rank3_reduce1_instances(instances); + add_device_softmax_f32_f32_rank3_reduce2_instances(instances); + add_device_softmax_f32_f32_rank3_reduce3_instances(instances); +} + +void add_device_softmax_f32_f32_rank4_instances( + std::vector>& instances) +{ + add_device_softmax_f32_f32_rank4_reduce1_instances(instances); + add_device_softmax_f32_f32_rank4_reduce2_instances(instances); + add_device_softmax_f32_f32_rank4_reduce3_instances(instances); + add_device_softmax_f32_f32_rank4_reduce4_instances(instances); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp new file mode 100644 index 00000000..57d3f184 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f32_f32_rank3_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp new file mode 100644 index 00000000..fae3a4dd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f32_f32_rank3_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp new file mode 100644 index 00000000..b6fb70e8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f32_f32_rank3_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp new file mode 100644 index 00000000..33c7b6f3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp new file mode 100644 index 00000000..c22aa574 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp new file mode 100644 index 00000000..55f3d2bd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp new file mode 100644 index 00000000..fb0bcf5e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp new file mode 100644 index 00000000..608cfcf8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_instances( + std::vector>& instances) +{ + add_device_softmax_i8_i8_rank3_reduce1_instances(instances); + add_device_softmax_i8_i8_rank3_reduce2_instances(instances); + add_device_softmax_i8_i8_rank3_reduce3_instances(instances); +} + +void add_device_softmax_i8_i8_rank4_instances( + std::vector>& instances) +{ + add_device_softmax_i8_i8_rank4_reduce1_instances(instances); + add_device_softmax_i8_i8_rank4_reduce2_instances(instances); + add_device_softmax_i8_i8_rank4_reduce3_instances(instances); + add_device_softmax_i8_i8_rank4_reduce4_instances(instances); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp new file mode 100644 index 00000000..15552dba --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_i8_i8_rank3_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp new file mode 100644 index 00000000..67674028 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_i8_i8_rank3_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp new file mode 100644 index 00000000..4b33da93 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_i8_i8_rank3_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp new file mode 100644 index 00000000..fe3b823e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp new file mode 100644 index 00000000..8ecdf87d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp new file mode 100644 index 00000000..35631352 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp new file mode 100644 index 00000000..aa21a0bf --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/utility/CMakeLists.txt b/library/src/utility/CMakeLists.txt new file mode 100644 index 00000000..7f6a59ee --- /dev/null +++ b/library/src/utility/CMakeLists.txt @@ -0,0 +1,28 @@ +## utility +set(UTILITY_SOURCE + device_memory.cpp + host_tensor.cpp + convolution_parameter.cpp +) + +add_library(utility STATIC ${UTILITY_SOURCE}) +add_library(composable_kernel::utility ALIAS utility) + +target_include_directories(utility PUBLIC + "$" + "$" +) + +rocm_install( + TARGETS utility + EXPORT utilityTargets +) + +rocm_install( + EXPORT utilityTargets + FILE composable_kernelutilityTargets.cmake + NAMESPACE composable_kernel:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel +) + +clang_tidy_check(utility) diff --git a/library/src/utility/convolution_parameter.cpp b/library/src/utility/convolution_parameter.cpp new file mode 100644 index 00000000..c8712d20 --- /dev/null +++ b/library/src/utility/convolution_parameter.cpp @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/host_utility/io.hpp" + +#include "ck/library/utility/convolution_parameter.hpp" + +namespace ck { +namespace utils { +namespace conv { + +ConvParam::ConvParam(ck::index_t n_dim, + ck::index_t group_count, + ck::index_t n_batch, + ck::index_t n_out_channels, + ck::index_t n_in_channels, + const std::vector& filters_len, + const std::vector& input_len, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads) + : num_dim_spatial_(n_dim), + G_(group_count), + N_(n_batch), + K_(n_out_channels), + C_(n_in_channels), + filter_spatial_lengths_(filters_len), + input_spatial_lengths_(input_len), + output_spatial_lengths_(num_dim_spatial_), + conv_filter_strides_(strides), + conv_filter_dilations_(dilations), + input_left_pads_(left_pads), + input_right_pads_(right_pads) +{ + if(static_cast(filter_spatial_lengths_.size()) != num_dim_spatial_ || + static_cast(input_spatial_lengths_.size()) != num_dim_spatial_ || + static_cast(conv_filter_strides_.size()) != num_dim_spatial_ || + static_cast(conv_filter_dilations_.size()) != num_dim_spatial_ || + static_cast(input_left_pads_.size()) != num_dim_spatial_ || + static_cast(input_right_pads_.size()) != num_dim_spatial_) + { + throw( + std::runtime_error("ConvParam::ConvParam: " + "parameter size is different from number of declared dimensions!")); + } + + for(ck::index_t i = 0; i < num_dim_spatial_; ++i) + { + // XEff = (X - 1) * conv_dilation_w + 1; + // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1; + + output_spatial_lengths_[i] = + (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) / + conv_filter_strides_[i] + + 1; + } +} + +ConvParam::ConvParam() + : ConvParam::ConvParam(2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}) +{ +} + +std::vector ConvParam::GetOutputSpatialLengths() const +{ + return output_spatial_lengths_; +} + +std::size_t ConvParam::GetFlops() const +{ + // 2 * G * N * K * C * * + return static_cast(2) * G_ * N_ * K_ * C_ * + ck::accumulate_n( + std::begin(output_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()) * + ck::accumulate_n( + std::begin(filter_spatial_lengths_), num_dim_spatial_, 1, std::multiplies<>()); +} + +std::string get_conv_param_parser_helper_msg() +{ + std::string msg; + + msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + return msg; +} + +ck::utils::conv::ConvParam parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[]) +{ + const ck::index_t G = std::stoi(argv[arg_idx++]); + const ck::index_t N = std::stoi(argv[arg_idx++]); + const ck::index_t K = std::stoi(argv[arg_idx++]); + const ck::index_t C = std::stoi(argv[arg_idx++]); + + std::vector filter_spatial_lengths(num_dim_spatial); + std::vector input_spatial_lengths(num_dim_spatial); + std::vector conv_filter_strides(num_dim_spatial); + std::vector conv_filter_dilations(num_dim_spatial); + std::vector input_left_pads(num_dim_spatial); + std::vector input_right_pads(num_dim_spatial); + + for(int i = 0; i < num_dim_spatial; ++i) + { + filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + + 
for(int i = 0; i < num_dim_spatial; ++i) + { + input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_right_pads[i] = std::stoi(argv[arg_idx++]); + } + + return ck::utils::conv::ConvParam{num_dim_spatial, + G, + N, + K, + C, + filter_spatial_lengths, + input_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}; +} +} // namespace conv +} // namespace utils +} // namespace ck + +std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParam& p) +{ + os << "ConvParam {" + << "\nnum_dim_spatial: " << p.num_dim_spatial_ << "\nG: " << p.G_ << "\nN: " << p.N_ + << "\nK: " << p.K_ << "\nC: " << p.C_ + << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths_ + << "\ninput_spatial_lengths: " << p.input_spatial_lengths_ + << "\nconv_filter_strides: " << p.conv_filter_strides_ + << "\nconv_filter_dilations: " << p.conv_filter_dilations_ + << "\ninput_left_pads: " << p.input_left_pads_ + << "\ninput_right_pads: " << p.input_right_pads_ << "}\n"; + + return os; +} diff --git a/library/src/utility/device_memory.cpp b/library/src/utility/device_memory.cpp new file mode 100644 index 00000000..90f94331 --- /dev/null +++ b/library/src/utility/device_memory.cpp @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/host_utility/hip_check_error.hpp" + +#include "ck/library/utility/device_memory.hpp" + +DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size) +{ + hip_check_error(hipMalloc(static_cast(&mpDeviceBuf), mMemSize)); +} + +void* DeviceMem::GetDeviceBuffer() const { return mpDeviceBuf; } + +std::size_t DeviceMem::GetBufferSize() const { return mMemSize; } + +void DeviceMem::ToDevice(const void* p) const +{ + hip_check_error(hipMemcpy(mpDeviceBuf, const_cast(p), mMemSize, hipMemcpyHostToDevice)); +} + +void DeviceMem::FromDevice(void* p) const +{ + hip_check_error(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost)); +} + +void DeviceMem::SetZero() const { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize)); } + +DeviceMem::~DeviceMem() { hip_check_error(hipFree(mpDeviceBuf)); } diff --git a/library/src/utility/host_tensor.cpp b/library/src/utility/host_tensor.cpp new file mode 100644 index 00000000..e34fbc8f --- /dev/null +++ b/library/src/utility/host_tensor.cpp @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include "ck/library/utility/host_tensor.hpp" + +void HostTensorDescriptor::CalculateStrides() +{ + mStrides.clear(); + mStrides.resize(mLens.size(), 0); + if(mStrides.empty()) + return; + + mStrides.back() = 1; + std::partial_sum( + mLens.rbegin(), mLens.rend() - 1, mStrides.rbegin() + 1, std::multiplies()); +} + +std::size_t HostTensorDescriptor::GetNumOfDimension() const { return mLens.size(); } + +std::size_t HostTensorDescriptor::GetElementSize() const +{ + assert(mLens.size() == mStrides.size()); + return std::accumulate( + mLens.begin(), mLens.end(), std::size_t{1}, std::multiplies()); +} + +std::size_t HostTensorDescriptor::GetElementSpaceSize() const +{ + std::size_t space = 1; + for(std::size_t i = 0; i < mLens.size(); ++i) + { + if(mLens[i] == 0) + continue; + + space += (mLens[i] - 1) * mStrides[i]; + } + return space; +} + +const std::vector& HostTensorDescriptor::GetLengths() const { return mLens; } + +const std::vector& HostTensorDescriptor::GetStrides() const { return mStrides; } + +std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc) +{ + os << "dim " << desc.GetNumOfDimension() << ", "; + + os << "lengths {"; + LogRange(os, desc.GetLengths(), ", "); + os << "}, "; + + os << "strides {"; + LogRange(os, desc.GetStrides(), ", "); + os << "}"; + + return os; +} diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt new file mode 100644 index 00000000..bdd7125a --- /dev/null +++ b/profiler/CMakeLists.txt @@ -0,0 +1,5 @@ +include_directories(BEFORE + ${CMAKE_CURRENT_LIST_DIR}/include +) + +add_subdirectory(src) diff --git a/profiler/README.md b/profiler/README.md new file mode 100644 index 00000000..bfd6a3a5 --- /dev/null +++ b/profiler/README.md @@ -0,0 +1,48 @@ +## Profile GEMM kernels +```bash +#arg1: tensor operation (gemm=GEMM) +#arg2: data type (0=fp32, 1=fp16) +#arg3: matrix layout (0=NN, 1=NT, 2=TN, 3=TT) +#arg4: verification (0=no, 1=yes) +#arg5: initialization (0=no init, 1=integer value, 2=decimal value) +#arg6: print matrix value (0=no, 1=yes) +#arg7: run kernel # of times (>1) +#arg8 to 13: M, N, K, StrideA, StrideB, StrideC + +################ op datatype layout verify init log repeat M___ N___ K___ StrideA StrideB StrideC +./bin/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +```bash +a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} +b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} +c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} +.... 
+Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s +``` + +## Profile 2d forward convolution kernels +```bash +#arg1: tensor operation (conv=Convolution) +#arg2: data type (0=fp32, 1=fp16) +#arg3: input tensor layout (0=NCHW, 1=NHWC) +#arg4: weight tensor layout (0=KCYX, 1=KYXC) +#arg5: output tensor layout (0=NKHW, 1=NHWK) +#arg6: verification (0=no, 1=yes) +#arg7: initialization (0=no init, 1=integer value, 2=decimal value) +#arg8: print matrix value (0=no, 1=yes) +#arg9: run kernel # of times (>1) +#arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx + ################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + ./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 +``` + +Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) +``` +in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} +wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} +out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} +.... +Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s +``` diff --git a/profiler/include/profiler/data_type_enum.hpp b/profiler/include/profiler/data_type_enum.hpp new file mode 100644 index 00000000..afcd6fea --- /dev/null +++ b/profiler/include/profiler/data_type_enum.hpp @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { + +enum struct DataTypeEnum +{ + Half = 0, + Float = 1, + Int32 = 2, + Int8 = 3, + Int8x4 = 4, + BFloat16 = 5, + Double = 6, + Unknown = 100, +}; + +} // namespace ck diff --git a/profiler/include/profiler/data_type_enum_helper.hpp b/profiler/include/profiler/data_type_enum_helper.hpp new file mode 100644 index 00000000..d9bd5e1a --- /dev/null +++ b/profiler/include/profiler/data_type_enum_helper.hpp @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
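+
+// Compile-time mapping between ck::DataTypeEnum and the corresponding C++
+// types, in both directions. Illustrative usage (a minimal sketch):
+//   using half_type = ck::get_datatype_from_enum<ck::DataTypeEnum::Half>::type; // ck::half_t
+//   static_assert(ck::get_datatype_enum_from_type<float>::value == ck::DataTypeEnum::Float, "");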
+ +#pragma + +#include "ck/utility/data_type.hpp" +#include "profiler/data_type_enum.hpp" + +namespace ck { + +template +struct get_datatype_from_enum; + +template <> +struct get_datatype_from_enum +{ + using type = int8_t; +}; + +template <> +struct get_datatype_from_enum +{ + using type = int32_t; +}; + +template <> +struct get_datatype_from_enum +{ + using type = half_t; +}; + +template <> +struct get_datatype_from_enum +{ + using type = float; +}; + +template <> +struct get_datatype_from_enum +{ + using type = double; +}; + +template +struct get_datatype_enum_from_type; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum value = DataTypeEnum::Int8; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum value = DataTypeEnum::Int32; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum value = DataTypeEnum::Half; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum value = DataTypeEnum::Float; +}; + +template <> +struct get_datatype_enum_from_type +{ + static constexpr DataTypeEnum value = DataTypeEnum::Double; +}; + +} // namespace ck diff --git a/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp new file mode 100644 index 00000000..b1625427 --- /dev/null +++ b/profiler/include/profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batched_gemm_add_relu_gemm_add_impl(bool do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int O, + int BatchCount = 1, + int StrideA0 = -1, + int StrideB0 = -1, + int StrideD0 = -1, + int StrideB1 = -1, + int StrideD1 = -1, + int StrideE1 = -1, + int BatchStrideA0 = -1, + int BatchStrideB0 = -1, + int BatchStrideD0 = -1, + int BatchStrideB1 = -1, + int BatchStrideD1 = -1, + int BatchStrideE1 = -1) + +{ + using Row = tensor_layout::gemm::RowMajor; + using Col = tensor_layout::gemm::ColumnMajor; + + using PassThrough = tensor_operation::element_wise::PassThrough; + + using A0ElementOp = PassThrough; + using B0ElementOp = PassThrough; + using CDE0ElementOp = ck::tensor_operation::element_wise::AddRelu; + using B1ElementOp = PassThrough; + using CDE1ElementOp = ck::tensor_operation::element_wise::Add; + + using D0DataType = remove_cvref_t>; + + using D0Layout = remove_cvref_t>; + using D1DataType = remove_cvref_t>; + using D1Layout = remove_cvref_t>; + + // for reference + using RefAcc0DataType = float; + using RefAcc1DataType = float; + + bool pass = true; + + const int DefaultStrideA0 = ck::is_same_v ? K : M; + const int DefaultStrideB0 = ck::is_same_v ? 
N : K; + const int DefaultStrideD0 = ck::is_same_v ? N : M; + const int DefaultStrideB1 = ck::is_same_v ? O : N; + const int DefaultStrideD1 = ck::is_same_v ? O : M; + const int DefaultStrideE1 = ck::is_same_v ? O : M; + + StrideA0 = (StrideA0 < 0) ? DefaultStrideA0 : StrideA0; + StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; + StrideD0 = (StrideD0 < 0) ? DefaultStrideD0 : StrideD0; + StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; + StrideD1 = (StrideD1 < 0) ? DefaultStrideD1 : StrideD1; + StrideE1 = (StrideE1 < 0) ? DefaultStrideE1 : StrideE1; + + const int DefaultBatchStrideA0 = (ck::is_same_v ? K : M) * StrideA0; + const int DefaultBatchStrideB0 = (ck::is_same_v ? N : K) * StrideB0; + const int DefaultBatchStrideD0 = (ck::is_same_v ? N : M) * StrideD0; + const int DefaultBatchStrideB1 = (ck::is_same_v ? O : N) * StrideB1; + const int DefaultBatchStrideD1 = (ck::is_same_v ? O : M) * StrideD1; + const int DefaultBatchStrideE1 = (ck::is_same_v ? O : M) * StrideE1; + + BatchStrideA0 = BatchStrideA0 < 0 ? DefaultBatchStrideA0 : BatchStrideA0; + BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; + BatchStrideD0 = BatchStrideD0 < 0 ? DefaultBatchStrideD0 : BatchStrideD0; + BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; + BatchStrideD1 = BatchStrideD1 < 0 ? DefaultBatchStrideD1 : BatchStrideD1; + BatchStrideE1 = BatchStrideE1 < 0 ? DefaultBatchStrideE1 : BatchStrideE1; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + // E_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a0_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA0, BatchStrideA0, A0Layout{})); + Tensor b0_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{})); + Tensor d0_g_m_n( + f_host_tensor_descriptor(BatchCount, M, N, StrideD0, BatchStrideD0, D0Layout{})); + Tensor b1_g_n_o( + f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{})); + Tensor d1_g_m_o( + f_host_tensor_descriptor(BatchCount, M, O, StrideD1, BatchStrideD1, D1Layout{})); + Tensor e1_g_m_o_host_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{})); + Tensor e1_g_m_o_device_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideE1, BatchStrideE1, E1Layout{})); + + // Host verification: Output of Gemm0 is input A of Gemm1 + Tensor c0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + Tensor e0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + Tensor c1_g_m_o(f_host_tensor_descriptor(BatchCount, M, O, O, M * O, Row{})); + + std::cout << "a0_g_m_k: " << a0_g_m_k.mDesc << std::endl; + std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl; + std::cout << "d0_g_m_n: " << d0_g_m_n.mDesc << std::endl; + std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; + std::cout << "d1_g_m_o: " << d1_g_m_o.mDesc << std::endl; + std::cout << "e1_g_m_o: " << e1_g_m_o_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a0_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + 
d0_g_m_n.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + d1_g_m_o.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + break; + default: + a0_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_g_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d1_g_m_o.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + DeviceMem a0_g_m_k_device_buf(sizeof(A0DataType) * a0_g_m_k.mDesc.GetElementSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize()); + DeviceMem d0_g_m_n_device_buf(sizeof(D0DataType) * d0_g_m_n.mDesc.GetElementSpaceSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize()); + DeviceMem d1_g_m_o_device_buf(sizeof(D1DataType) * d1_g_m_o.mDesc.GetElementSpaceSize()); + DeviceMem e1_g_m_o_device_buf(sizeof(E1DataType) * + e1_g_m_o_device_result.mDesc.GetElementSize()); + + a0_g_m_k_device_buf.ToDevice(a0_g_m_k.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); + d0_g_m_n_device_buf.ToDevice(d0_g_m_n.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); + d1_g_m_o_device_buf.ToDevice(d1_g_m_o.mData.data()); + + auto a0_element_op = A0ElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto cde0_element_op = CDE0ElementOp{}; + auto b1_element_op = B1ElementOp{}; + auto cde1_element_op = CDE1ElementOp{}; + + using DeviceOp = + tensor_operation::device::DeviceBatchedGemmMultipleDGemmMultipleD; + + // get device op instances + const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + if(do_verification) + { + // Ref Gemm0 + using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm; + + // Ref Gemm1 + using ReferenceGemm1Instance = tensor_operation::host::ReferenceBatchedGemm; + + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a0_g_m_k, b0_g_k_n, c0_g_m_n, a0_element_op, b0_element_op, PassThrough{}); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // cde0_elementwise + e0_g_m_n.ForEach( + [&](auto&, auto idx) { cde0_element_op(e0_g_m_n(idx), c0_g_m_n(idx), d0_g_m_n(idx)); }); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + e0_g_m_n, b1_g_n_o, c1_g_m_o, PassThrough{}, b1_element_op, PassThrough{}); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // cde1_elementwise + e1_g_m_o_host_result.ForEach([&](auto&, auto idx) { + cde1_element_op(e1_g_m_o_host_result(idx), c1_g_m_o(idx), d1_g_m_o(idx)); + }); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(a0_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + std::array{d0_g_m_n_device_buf.GetDeviceBuffer()}, + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + std::array{d1_g_m_o_device_buf.GetDeviceBuffer()}, + static_cast(e1_g_m_o_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + BatchCount, + StrideA0, + StrideB0, + std::array{StrideD0}, + 
StrideB1, + std::array{StrideD1}, + StrideE1, + BatchStrideA0, + BatchStrideB0, + std::array{BatchStrideD0}, + BatchStrideB1, + std::array{BatchStrideD1}, + BatchStrideE1, + a0_element_op, + b0_element_op, + cde0_element_op, + b1_element_op, + cde1_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = + (sizeof(A0DataType) * M * K + sizeof(B0DataType) * K * N + sizeof(D0DataType) * N + + sizeof(B1DataType) * N * O + sizeof(E1DataType) * M * O + sizeof(D1DataType) * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + e1_g_m_o_device_buf.FromDevice(e1_g_m_o_device_result.mData.data()); + + pass = pass & ck::utils::check_err(e1_g_m_o_device_result, e1_g_m_o_host_result); + + if(do_log) + { + LogRangeAsType( + std::cout << "e1_g_m_o_host_result : ", e1_g_m_o_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "e1_g_m_o_device_result : ", e1_g_m_o_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp new file mode 100644 index 00000000..1583c6db --- /dev/null +++ b/profiler/include/profiler/profile_batched_gemm_gemm_impl.hpp @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
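+
+// Profiles C = (A * B0) * B1 over every registered DeviceBatchedGemmGemm
+// instance and keeps the fastest supported one. When do_verification is set,
+// the result is checked against a two-step CPU reference (Gemm0 feeding
+// Gemm1). Reported TFlops follow the formula used in the loop below:
+//   flop = (2*M*N*K + 2*M*N*O) * BatchCount, tflops = flop / 1e9 / ave_time_ms.
+// The function returns false early if the instance factory finds no kernels.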
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batched_gemm_gemm_impl(bool do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int O, + int BatchCount = 1, + int StrideA = -1, + int StrideB0 = -1, + int StrideB1 = -1, + int StrideC = -1, + int BatchStrideA = -1, + int BatchStrideB0 = -1, + int BatchStrideB1 = -1, + int BatchStrideC = -1) + +{ + + using Row = tensor_layout::gemm::RowMajor; + using Col = tensor_layout::gemm::ColumnMajor; + using PassThrough = tensor_operation::element_wise::PassThrough; + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using B1ElementOp = PassThrough; + using Acc0ElementOp = PassThrough; + using CElementOp = PassThrough; + using AccDataType = float; + + // Ref Gemm0 + using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm; + + // Ref Gemm + using ReferenceGemm1Instance = tensor_operation::host::ReferenceBatchedGemm; + + bool pass = true; + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB0 = ck::is_same_v ? N : K; + const int DefaultStrideB1 = ck::is_same_v ? O : N; + const int DefaultStrideC = ck::is_same_v ? O : M; + + StrideA = (StrideA < 0) ? DefaultStrideA : StrideA; + StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; + StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; + StrideC = (StrideC < 0) ? DefaultStrideC : StrideC; + + const int DefaultBatchStrideA = (ck::is_same_v ? K : M) * StrideA; + const int DefaultBatchStrideB0 = (ck::is_same_v ? N : K) * StrideB0; + const int DefaultBatchStrideB1 = (ck::is_same_v ? O : N) * StrideB1; + const int DefaultBatchStrideC = (ck::is_same_v ? O : M) * StrideC; + + BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA; + BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; + BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; + BatchStrideC = BatchStrideC < 0 ? 
DefaultBatchStrideC : BatchStrideC; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + // C_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + Tensor b0_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{})); + Tensor b1_g_n_o( + f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{})); + Tensor c_g_m_o_host_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + Tensor c_g_m_o_device_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + // Host verification: Output of Gemm0 is input A of Gemm1 + Tensor acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl; + std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; + std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-2, 3}); + break; + case 2: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize()); + DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * c_g_m_o_device_result.mDesc.GetElementSize()); + + a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); + + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + using DeviceOp = tensor_operation::device::DeviceBatchedGemmGemm; + + // get device op instances + const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // early fail when no instances are found + if(op_ptrs.size() == 0) + { + return false; + } + + if(do_verification) + { + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, 
b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, PassThrough{}); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + acc0_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(a_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + static_cast(c_g_m_o_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + BatchCount, + StrideA, + StrideB0, + StrideB1, + StrideC, + BatchStrideA, + BatchStrideB0, + BatchStrideB1, + BatchStrideC, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a_g_m_k: ", a_g_m_k.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b0_g_k_n : ", b0_g_k_n.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b1_g_n_o : ", b1_g_n_o.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_g_m_o_host_result : ", c_g_m_o_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_g_m_o_device_result : ", c_g_m_o_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp new file mode 100644 index 00000000..c07d7c05 --- /dev/null +++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
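+
+// Plain batched GEMM profiling loop: host tensors A[G,M,K], B[G,K,N] and
+// C[G,M,N] are built from the given strides/batch strides, every instance
+// returned by the DeviceOperationInstanceFactory is run, and the fastest
+// supported kernel is reported. C is zeroed before each instance so that
+// verification never sees stale output from a previously profiled kernel.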
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batched_gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batched_gemm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int BatchStrideA, + int BatchStrideB, + int BatchStrideC, + int StrideA, + int StrideB, + int StrideC, + int BatchCount) +{ + bool pass = true; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + Tensor a_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + Tensor b_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB, BatchStrideB, BLayout{})); + Tensor c_g_m_n_host_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + Tensor c_g_m_n_device_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, BatchStrideC, CLayout{})); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; + std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + if(do_verification) + { + using ReferenceBatchedGemmInstance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; + auto ref_invoker = ref_batched_gemm.MakeInvoker(); + + auto ref_argument = ref_batched_gemm.MakeArgument( + a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_g_m_k.mData.data()); + b_device_buf.ToDevice(b_g_k_n.mData.data()); + c_device_buf.ToDevice(c_g_m_n_device_result.mData.data()); + + using DeviceOp = 
ck::tensor_operation::device::DeviceBatchedGemm; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + BatchStrideA, + BatchStrideB, + BatchStrideC, + BatchCount, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * BatchCount * M * N * K; + + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_g_m_n_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp new file mode 100644 index 00000000..45b7b773 --- /dev/null +++ b/profiler/include/profiler/profile_batched_gemm_reduce_impl.hpp @@ -0,0 +1,362 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
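+
+// Profiles batched GEMM fused with two row-wise reductions over N:
+//   d0(g, m) = sum_n c(g, m, n)      (PassThrough + Add)
+//   d1(g, m) = sum_n c(g, m, n)^2    (UnarySquare + Add)
+// The host reference further below reproduces exactly this accumulation, and
+// both reduction buffers are zero-initialized before every profiled kernel.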
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F16 = ck::half_t; +using ReducePtrsGlobal = ck::Tuple; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using DeviceGemmReduceNoOpPtr = + ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>; + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( + std::vector&); + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( + std::vector&); + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances( + std::vector&); + +void add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( + std::vector&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +bool profile_batched_gemm_reduce_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int BatchCount) +{ + bool pass = true; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {row * stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {col * stride, 1_uz, stride}); + } + }; + + Tensor a_g_m_k(f_host_tensor_descriptor(BatchCount, M, K, StrideA, ALayout{})); + Tensor b_g_k_n(f_host_tensor_descriptor(BatchCount, K, N, StrideB, BLayout{})); + + Tensor c_g_m_n_host_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); + Tensor d0_g_m_host_result({BatchCount, M}); + Tensor d1_g_m_host_result({BatchCount, M}); + + Tensor c_g_m_n_device_result( + f_host_tensor_descriptor(BatchCount, M, N, StrideC, CLayout{})); + Tensor d0_g_m_device_result({BatchCount, M}); + Tensor d1_g_m_device_result({BatchCount, M}); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; + std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; + std::cout << "d0_g_m: " << d0_g_m_host_result.mDesc << std::endl; + std::cout << "d1_g_m: " << d1_g_m_host_result.mDesc << std::endl; + + std::size_t num_thread = std::thread::hardware_concurrency(); + switch(init_method) + { + case 0: break; + case 1: + std::srand(0); + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + b_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + 
break; + default: + std::srand(0); + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + b_g_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + using ReduceOp0 = ck::reduce::Add; + using ReduceOp1 = ck::reduce::Add; + using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + std::array gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op}; + + const auto reduce0_op = ReduceOp0{}; + const auto reduce1_op = ReduceOp1{}; + + auto passthrough = UnaryIdenticElementOp{}; + auto square = UnarySquareElementOp{}; + std::array reduce_in_element_ops = {&passthrough, &square}; + std::array reduce_out_element_ops = {&passthrough, &passthrough}; + + if(do_verification) + { + using ReferenceBatchedGemmInstance = + ck::tensor_operation::host::ReferenceBatchedGemm; + + using ReduceAccDataType = ReduceDataType; + + auto ref_batched_gemm = ReferenceBatchedGemmInstance{}; + auto ref_invoker = ref_batched_gemm.MakeInvoker(); + + auto ref_argument = ref_batched_gemm.MakeArgument( + a_g_m_k, b_g_k_n, c_g_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + for(int batch = 0; batch < BatchCount; ++batch) + { + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + auto reduce1_acc = reduce1_op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + ReduceAccDataType d0_val = + ck::type_convert(c_g_m_n_host_result(batch, m, n)); + ReduceAccDataType d1_val; + + square(d1_val, d0_val); + reduce0_op(reduce0_acc, d0_val); + reduce1_op(reduce1_acc, d1_val); + } + + d0_g_m_host_result(batch, m) = ck::type_convert(reduce0_acc); + d1_g_m_host_result(batch, m) = ck::type_convert(reduce1_acc); + } + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_g_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_g_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * + d0_g_m_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce1_device_buf(sizeof(ReduceDataType) * + d1_g_m_device_result.mDesc.GetElementSpaceSize()); + + std::array p_reduces = {reduce0_device_buf.GetDeviceBuffer(), + reduce1_device_buf.GetDeviceBuffer()}; + + a_device_buf.ToDevice(a_g_m_k.mData.data()); + b_device_buf.ToDevice(b_g_k_n.mData.data()); + + // add device GEMM instances + std::vector gemm_ptrs; + + if constexpr(is_same::value && is_same::value && + is_same::value) + { + if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + 
add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instances( + gemm_ptrs); + } + } + + if(gemm_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! no device GEMM instance found"); + } + + std::string best_gemm_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device GEMM instances + for(auto& gemm_ptr : gemm_ptrs) + { + auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + nullptr, + {}, + c_device_buf.GetDeviceBuffer(), + p_reduces, + M, + N, + K, + StrideA, + StrideB, + StrideC, + {}, + gemm_element_ops, + {}, + reduce_in_element_ops, + reduce_out_element_ops, + BatchCount); + + auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); + + if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) + { + // init DO, D1 to 0 + reduce0_device_buf.SetZero(); + reduce1_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::string gemm_name = gemm_ptr->GetTypeString(); + + std::size_t flop = std::size_t(2) * BatchCount * M * N * K; + std::size_t num_btype = sizeof(ADataType) * BatchCount * M * K + + sizeof(BDataType) * BatchCount * K * N + + sizeof(CDataType) * BatchCount * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm_name << std::endl; + + if(tflops > best_tflops) + { + best_gemm_name = gemm_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_g_m_n_device_result.mData.data()); + reduce0_device_buf.FromDevice(d0_g_m_device_result.mData.data()); + reduce1_device_buf.FromDevice(d1_g_m_device_result.mData.data()); + + bool c_error = ck::utils::check_err(c_g_m_n_device_result, c_g_m_n_host_result); + bool d0_error = ck::utils::check_err(d0_g_m_device_result, d0_g_m_host_result); + bool d1_error = ck::utils::check_err(d1_g_m_device_result, d1_g_m_host_result); + + pass = pass && (c_error == true); + pass = pass && (d0_error == true); + pass = pass && (d1_error == true); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_g_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_g_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host: ", c_g_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_g_m_n_device_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "d0_host: ", d0_g_m_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d0_device: ", d0_g_m_device_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "d1_host: ", d1_g_m_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d1_device: ", d1_g_m_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << "does not support this GEMM problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff 
--git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp new file mode 100644 index 00000000..f5ec2351 --- /dev/null +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_impl.hpp @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batched_gemm_softmax_gemm_impl(bool do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int O, + int BatchCount = 1, + int StrideA = -1, + int StrideB0 = -1, + int StrideB1 = -1, + int StrideC = -1, + int BatchStrideA = -1, + int BatchStrideB0 = -1, + int BatchStrideB1 = -1, + int BatchStrideC = -1, + float alpha = -1.f) + +{ + + using Row = tensor_layout::gemm::RowMajor; + using Col = tensor_layout::gemm::ColumnMajor; + using PassThrough = tensor_operation::element_wise::PassThrough; + using Scale = tensor_operation::element_wise::Scale; + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using Acc0ElementOp = Scale; + using B1ElementOp = PassThrough; + using CElementOp = PassThrough; + using AccDataType = float; + + // Ref Gemm0: various type in, fp32 out + using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm; + + // Ref Softmax: fp32 in, various type out + using ReferenceSoftmaxInstance = + tensor_operation::host::ReferenceSoftmax; + + // Ref Gemm1: various type in, various type out + using ReferenceGemm1Instance = tensor_operation::host::ReferenceBatchedGemm; + + bool pass = true; + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB0 = ck::is_same_v ? N : K; + const int DefaultStrideB1 = ck::is_same_v ? O : N; + const int DefaultStrideC = ck::is_same_v ? O : M; + + StrideA = (StrideA < 0) ? DefaultStrideA : StrideA; + StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0; + StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1; + StrideC = (StrideC < 0) ? DefaultStrideC : StrideC; + + const int DefaultBatchStrideA = (ck::is_same_v ? K : M) * StrideA; + const int DefaultBatchStrideB0 = (ck::is_same_v ? N : K) * StrideB0; + const int DefaultBatchStrideB1 = (ck::is_same_v ? O : N) * StrideB1; + const int DefaultBatchStrideC = (ck::is_same_v ? O : M) * StrideC; + + BatchStrideA = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA; + BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0; + BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1; + BatchStrideC = BatchStrideC < 0 ? 
DefaultBatchStrideC : BatchStrideC; + + auto f_host_tensor_descriptor = [](std::size_t batch_count, + std::size_t row, + std::size_t col, + std::size_t stride, + std::size_t batch_stride, + auto layout) { + using namespace ck::literals; + + if(std::is_same::value) + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, stride, 1_uz}); + } + else + { + return HostTensorDescriptor({batch_count, row, col}, {batch_stride, 1_uz, stride}); + } + }; + + // C_m_o = A_m_k * B0_k_n * B1_n_o + Tensor a_g_m_k( + f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{})); + Tensor b0_g_k_n( + f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{})); + Tensor b1_g_n_o( + f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{})); + Tensor c_g_m_o_host_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + Tensor c_g_m_o_device_result( + f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{})); + // Host verification: Output of Gemm0 is input A of Gemm1 + Tensor acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + Tensor a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{})); + + std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl; + std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl; + std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl; + std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl; + + std::srand(1); // work around test flakiness + switch(init_method) + { + case 0: break; + case 1: + // Still unsure whether this kind of deterministic floating point accurary issue is expected + // or not. May want to try exact same approach as the GPU kernel in the host reference + // GEMM+Softmax+GEMM function to see if the accuracy discrepancy goes away. Until then, + // shrink the input value range as it is less likely to produce errors of around ~1e-3. 
+ // a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_g_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_g_m_k.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + default: + a_g_m_k.GenerateTensorValue(GeneratorTensor_1{1}); + b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize()); + DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize()); + DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize()); + DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * c_g_m_o_device_result.mDesc.GetElementSize()); + + a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data()); + b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data()); + b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data()); + + if(alpha < 0) + { + alpha = 1.f / std::sqrt(K); // usually 1 / sqrt(head_dim) + } + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + using DeviceOp = tensor_operation::device::DeviceBatchedGemmSoftmaxGemm; + + // get device op instances + const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + if(do_verification) + { + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, Scale{alpha}); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // mask out upper triangle + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(MaskOutUpperTriangle && idx[1] < idx[2]) + self(idx) = -ck::NumericLimits::Infinity(); + }); + + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(a_g_m_k_device_buf.GetDeviceBuffer()), + static_cast(b0_g_k_n_device_buf.GetDeviceBuffer()), + static_cast(b1_g_n_o_device_buf.GetDeviceBuffer()), + static_cast(c_g_m_o_device_buf.GetDeviceBuffer()), + M, + N, + K, + O, + 
BatchCount, + StrideA, + StrideB0, + StrideB1, + StrideC, + BatchStrideA, + BatchStrideB0, + BatchStrideB1, + BatchStrideC, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_g_m_o_device_result, c_g_m_o_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a_g_m_k: ", a_g_m_k.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b0_g_k_n : ", b0_g_k_n.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b1_g_n_o : ", b1_g_n_o.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_g_m_o_host_result : ", c_g_m_o_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_g_m_o_device_result : ", c_g_m_o_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp new file mode 100644 index 00000000..91c28f25 --- /dev/null +++ b/profiler/include/profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
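+
+// Attention-style fused op on permuted layouts:
+//   C = softmax(alpha * A * B0) * B1, with alpha defaulting to 1/sqrt(K).
+// Tensors are laid out as [G0, M/N/O, G1, K/N/O] (lengths stored {G0, G1, ...});
+// host verification first flattens them into [G0*G1, ...] batches, then runs
+// reference GEMM -> optional causal mask -> softmax -> GEMM and permutes the
+// result back before comparison.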
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batched_gemm_softmax_gemm_permute_impl(bool do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int O, + int G0, + int G1, + float alpha = -1.f) + +{ + + using PassThrough = tensor_operation::element_wise::PassThrough; + using Scale = tensor_operation::element_wise::Scale; + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using Acc0ElementOp = Scale; + using B1ElementOp = PassThrough; + using CElementOp = PassThrough; + using AccDataType = float; + using tensor_operation::device::MaskingSpecialization; + + // Ref Gemm0: various type in, fp32 out + using ReferenceGemm0Instance = tensor_operation::host::ReferenceBatchedGemm; + + // Ref Softmax: fp32 in, various type out + using ReferenceSoftmaxInstance = + tensor_operation::host::ReferenceSoftmax; + + // Ref Gemm1: various type in, various type out + using ReferenceGemm1Instance = tensor_operation::host::ReferenceBatchedGemm; + + bool pass = true; + + // A layout [G0, M, G1, K] + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1}; + + // B0 layout [G0, N, G1, K] + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1}; + + // B1 layout [G0, N, G1, O] + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O}; + + // C layout [G0, M, G1, O] + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; + + const int BatchCount = G0 * G1; + + Tensor a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides); + Tensor b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides); + Tensor b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides); + Tensor c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + Tensor c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides); + + std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl; + std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl; + std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl; + std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl; + + std::srand(1); // work around test flakiness + switch(init_method) + { + case 0: break; + case 1: + // Still unsure whether this kind of deterministic floating point accurary issue is expected + // or not. May want to try exact same approach as the GPU kernel in the host reference + // GEMM+Softmax+GEMM function to see if the accuracy discrepancy goes away. Until then, + // shrink the input value range as it is less likely to produce errors of around ~1e-3. 
+ // a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + case 2: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + case 3: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + break; + default: + a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_1{1}); + b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Sequential<1>{}); + b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal{}); + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_gs_ms_ks.mDesc.GetElementSpaceSize()); + DeviceMem b0_device_buf(sizeof(B0DataType) * b0_gs_ns_ks.mDesc.GetElementSpaceSize()); + DeviceMem b1_device_buf(sizeof(B1DataType) * b1_gs_os_ns.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * + c_gs_ms_os_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_gs_ms_ks.mData.data()); + b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data()); + b1_device_buf.ToDevice(b1_gs_os_ns.mData.data()); + + if(alpha < 0) + { + alpha = 1.f / std::sqrt(K); // usually 1 / sqrt(head_dim) + } + auto a_element_op = AElementOp{}; + auto b0_element_op = B0ElementOp{}; + auto acc0_element_op = Acc0ElementOp{alpha}; + auto b1_element_op = B1ElementOp{}; + auto c_element_op = CElementOp{}; + + using DeviceOp = tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + MaskingSpec>; + + // get device op instances + const auto op_ptrs = tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + Tensor a_g_m_k({BatchCount, M, K}); + Tensor b0_g_k_n({BatchCount, K, N}); + Tensor b1_g_n_o({BatchCount, N, O}); + Tensor acc0_g_m_n({BatchCount, M, N}); // scratch object after gemm0 + Tensor a1_g_m_n({BatchCount, M, N}); // scratch object after softmax + Tensor c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1 + + // permute + a_gs_ms_ks.ForEach([&](auto& self, auto idx) { + a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx); + }); + b0_gs_ns_ks.ForEach([&](auto& self, auto idx) { + b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + b1_gs_os_ns.ForEach([&](auto& self, auto idx) { + b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx); + }); + + auto ref_gemm0 = ReferenceGemm0Instance{}; + auto ref_gemm0_invoker = ref_gemm0.MakeInvoker(); + auto ref_gemm0_argument = ref_gemm0.MakeArgument( + a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, Scale{alpha}); + + ref_gemm0_invoker.Run(ref_gemm0_argument); + + // mask out upper triangle + acc0_g_m_n.ForEach([&](auto& self, auto idx) { + if(MaskingSpec == MaskingSpecialization::MaskOutUpperTriangle && idx[1] < idx[2]) + 
self(idx) = -ck::NumericLimits::Infinity(); + }); + + auto ref_softmax = ReferenceSoftmaxInstance{}; + auto ref_softmax_invoker = ref_softmax.MakeInvoker(); + auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2}); + + ref_softmax_invoker.Run(ref_softmax_argument); + + auto ref_gemm1 = ReferenceGemm1Instance{}; + auto ref_gemm1_invoker = ref_gemm1.MakeInvoker(); + auto ref_gemm1_argument = ref_gemm1.MakeArgument( + a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op); + + ref_gemm1_invoker.Run(ref_gemm1_argument); + + // permute + c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) { + const size_t& g0 = idx[0]; + const size_t& g1 = idx[1]; + + const size_t g = g0 * G1 + g1; + + self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]); + }); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b0_device_buf.GetDeviceBuffer()), + static_cast(b1_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + {}, // std::array p_acc0_biases; + {}, // std::array p_acc1_biases; + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // std::array, 1>{acc0_biases_gs_ms_ns_lengths}, + {}, // std::array, 1>{acc0_biases_gs_ms_ns_strides}, + {}, // std::array, 1>{acc1_biases_gs_ms_os_lengths}, + {}, // std::array, 1>{acc1_biases_gs_ms_os_strides}, + a_element_op, + b0_element_op, + acc0_element_op, + b1_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount; + std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + + sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) * + BatchCount; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data()); + + // default absolute error and relative error is 0.001 + double rtol = 1e-3; + double atol = 1e-3; + + // when BF16 is taken, set absolute error and relative error to 0.01 + if(std::is_same_v && + std::is_same_v && + std::is_same_v && + std::is_same_v) + { + rtol = 1e-2; + atol = 1e-2; + } + + pass = pass & ck::utils::check_err(c_gs_ms_os_device_result, + c_gs_ms_os_host_result, + "Error: Incorrect results!", + rtol, + atol); + + if(do_log) + { + LogRangeAsType(std::cout << "a_gs_ms_ks: ", a_gs_ms_ks.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b0_gs_ns_ks : ", b0_gs_ns_ks.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b1_gs_os_ns : ", b1_gs_os_ns.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_gs_ms_os_host_result : ", 
c_gs_ms_os_host_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "c_gs_ms_os_device_result : ", + c_gs_ms_os_device_result.mData, + ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp new file mode 100644 index 00000000..79d88620 --- /dev/null +++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp @@ -0,0 +1,390 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_backward.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batchnorm_backward_impl(bool do_verification, + int init_method, + bool do_dumpout, + bool time_kernel, + const std::vector inOutLengths, + const std::vector reduceDims, + bool haveSavedMeanInvVar, + double epsilon) +{ + if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim) + { + throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!"); + }; + + std::vector scaleBiasMeanVarLengths; + + // used for calculating the effective transferred bytes by each operation + size_t total_length; + size_t invariant_length = 1; + + total_length = + std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies{}); + + if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + for(int dim = 0; dim < Rank; dim++) + { + if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; })) + { + scaleBiasMeanVarLengths.push_back(inOutLengths[dim]); + invariant_length *= inOutLengths[dim]; + }; + } + + // input data of the batchnorm backward algorithm + Tensor x(inOutLengths); + Tensor dy(inOutLengths); + Tensor bnScale(scaleBiasMeanVarLengths); + + Tensor savedMean(scaleBiasMeanVarLengths); + Tensor savedInvVar(scaleBiasMeanVarLengths); + // savedVariance is only used for initializing savedInvVar + Tensor savedVariance(scaleBiasMeanVarLengths); + + // output data of the batchnorm backward algorithm + Tensor dx_ref(inOutLengths); + Tensor dx(inOutLengths); + + Tensor dscale(scaleBiasMeanVarLengths); + Tensor dbias(scaleBiasMeanVarLengths); + + Tensor dscale_ref(scaleBiasMeanVarLengths); + Tensor dbias_ref(scaleBiasMeanVarLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(haveSavedMeanInvVar) + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.0001f; + + // input data in normal distribution + 
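// The verification path of the attention-style profiler above scales the Q*K^T
// product by alpha (1/sqrt(K) by default), writes -inf into the upper triangle
// when MaskOutUpperTriangle is requested, and then runs a row-wise softmax over
// the N dimension before the second GEMM. The standalone sketch below mirrors
// that masked-softmax step on a plain row-major M x N buffer; it is an
// illustration, not the CK reference op.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

inline void masked_row_softmax(std::vector<float>& scores, int M, int N, bool mask_upper_triangle)
{
    for(int m = 0; m < M; ++m)
    {
        float* row = scores.data() + static_cast<std::size_t>(m) * N;

        // same condition as "idx[1] < idx[2]" in the ForEach above
        if(mask_upper_triangle)
            for(int n = m + 1; n < N; ++n)
                row[n] = -std::numeric_limits<float>::infinity();

        // numerically stable softmax: subtract the row maximum before exp
        const float row_max = *std::max_element(row, row + N);

        float sum = 0.f;
        for(int n = 0; n < N; ++n)
        {
            row[n] = std::exp(row[n] - row_max);
            sum += row[n];
        }
        for(int n = 0; n < N; ++n)
            row[n] /= sum;
    }
}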
x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the savedMean to be values with tiny variation to the mean of the x values + savedMean.GenerateTensorValue(GeneratorTensor_4{x_mean, noise_stddev}, + num_thread); + + // initialize the variance to be values with tiny variation to the variance of the x values + savedVariance.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + + auto it_src = savedVariance.mData.begin(); + auto it_dst = savedInvVar.mData.begin(); + float tmp_epsilon = std::numeric_limits::epsilon(); + + while(it_src != savedVariance.mData.end()) + { + *it_dst = type_convert( + 1.0f / std::sqrtf(type_convert(*it_src) + tmp_epsilon)); + + it_src++; + it_dst++; + }; + } + else + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + dy.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + dy.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + dy.GenerateTensorValue(GeneratorTensor_2{-2, 2}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + dy.GenerateTensorValue(GeneratorTensor_3{-0.2f, 0.2f}, num_thread); + bnScale.GenerateTensorValue(GeneratorTensor_3{-0.5f, 0.5f}, num_thread); + } + }; + + // input data of the batchnorm backward algorithm + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem dy_dev(sizeof(DyDataType) * dy.mDesc.GetElementSpaceSize()); + + DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize()); + + DeviceMem savedMean_dev(sizeof(MeanVarDataType) * savedMean.mDesc.GetElementSpaceSize()); + DeviceMem savedInvVar_dev(sizeof(MeanVarDataType) * savedInvVar.mDesc.GetElementSpaceSize()); + + // output data of the batchnorm backward algorithm + DeviceMem dx_dev(sizeof(DxDataType) * dx.mDesc.GetElementSpaceSize()); + + DeviceMem dscale_dev(sizeof(DscaleDbiasDataType) * dscale.mDesc.GetElementSpaceSize()); + DeviceMem dbias_dev(sizeof(DscaleDbiasDataType) * dbias.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + dy_dev.ToDevice(dy.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + + if(haveSavedMeanInvVar) + { + savedMean_dev.ToDevice(savedMean.mData.data()); + savedInvVar_dev.ToDevice(savedInvVar.mData.data()); + }; + + std::array arrInOutLengths; + std::array arrInOutStrides; + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + std::array arrReduceDims; + + std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + arrScaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + arrScaleBiasMeanVarStrides.begin()); + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + // add device batchnorm-backward instances + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; + + // get device op instances + const auto instance_ptrs = 
+ ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceBatchNormBwdInstance = + ck::tensor_operation::host::ReferenceBatchNormBwd; + + auto batchNormBwd_ref = ReferenceBatchNormBwdInstance{}; + + auto argument_ptr_ref = batchNormBwd_ref.MakeArgumentPointer( + arrInOutLengths, + arrInOutStrides, + arrInOutStrides, + arrInOutStrides, + arrReduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + x.mData.data(), + dy.mData.data(), + bnScale.mData.data(), + haveSavedMeanInvVar ? savedMean.mData.data() : nullptr, + haveSavedMeanInvVar ? savedInvVar.mData.data() : nullptr, + epsilon, + PassThroughOp{}, + dx_ref.mData.data(), + dscale_ref.mData.data(), + dbias_ref.mData.data()); + + if(!batchNormBwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters not supported by the reference instance, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = batchNormBwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + } + + int num_kernel = 0; + bool pass = true; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + arrInOutLengths, + arrInOutStrides, + arrInOutStrides, + arrInOutStrides, + arrReduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + dy_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + haveSavedMeanInvVar ? savedMean_dev.GetDeviceBuffer() : nullptr, + haveSavedMeanInvVar ? 
savedInvVar_dev.GetDeviceBuffer() : nullptr, + epsilon, + PassThroughOp{}, + dx_dev.GetDeviceBuffer(), + dscale_dev.GetDeviceBuffer(), + dbias_dev.GetDeviceBuffer()); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + num_kernel++; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() + << " skipped due to unsupported argument: " << std::endl; + } + + continue; + }; + + size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get()); + + DeviceMem workspace_dev(workspace_sz); + + inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + size_t num_bytes = 0; + + // inputing of x, dy, scale, outputing of dx, dscale, dbias + num_bytes += total_length * (sizeof(XDataType) + sizeof(DyDataType) + sizeof(DxDataType)) + + invariant_length * sizeof(DscaleDbiasDataType) * 2; + + // inputting of savedMean, savedInvVariance + if(haveSavedMeanInvVar) + num_bytes += invariant_length * sizeof(MeanVarDataType) * 2; + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + using ck::utils::check_err; + bool single_pass = true; + + dx_dev.FromDevice(dx.mData.data()); + dscale_dev.FromDevice(dscale.data()); + dbias_dev.FromDevice(dbias.data()); + + // clang-format off + single_pass = single_pass && ck::utils::check_err(dx.mData, dx_ref.mData, "dx result:", 5e-4, 5e-4); + single_pass = single_pass && ck::utils::check_err(dscale.mData, dscale_ref.mData, "dScale result:", 3e-3, 3e-3); + single_pass = single_pass && ck::utils::check_err(dbias.mData, dbias_ref.mData, "dBias result:", 3e-3, 3e-3); + // clang-format on + + pass = pass && single_pass; + }; + + if(do_dumpout) + { + using ck::host_common::dumpBufferToFile; + + // clang-format off + dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize()); + dumpBufferToFile("dump_dy.bin", dy.mData.data(), dy.mDesc.GetElementSize()); + dumpBufferToFile("dump_dx.bin", dx.mData.data(), dx.mDesc.GetElementSize()); + dumpBufferToFile("dump_dx_ref.bin", dx_ref.mData.data(), dx_ref.mDesc.GetElementSize()); + dumpBufferToFile("dump_dscale.bin", dscale.mData.data(), dscale.mDesc.GetElementSize()); + dumpBufferToFile("dump_dscale_ref.bin", dscale_ref.mData.data(), dscale_ref.mDesc.GetElementSize()); + // clang-format off + }; + } + + if(time_kernel) + { + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_batchnorm_forward_impl.hpp b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp new file mode 100644 index 00000000..82fe75bf --- /dev/null +++ b/profiler/include/profiler/profile_batchnorm_forward_impl.hpp @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
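// When haveSavedMeanInvVar is set, the backward profiler above seeds savedInvVar
// from a generated variance tensor as 1 / sqrt(variance + eps), with eps taken
// from std::numeric_limits<float>::epsilon(). A portable sketch of that
// conversion is below (using std::sqrt rather than std::sqrtf, which the
// standard does not guarantee); it is an illustration, not the tensor iterator
// loop used in the code above.
#include <cmath>
#include <cstddef>
#include <limits>
#include <vector>

inline void variance_to_inv_stddev(const std::vector<float>& variance, std::vector<float>& inv_stddev)
{
    const float eps = std::numeric_limits<float>::epsilon();

    inv_stddev.resize(variance.size());
    for(std::size_t i = 0; i < variance.size(); ++i)
        inv_stddev[i] = 1.0f / std::sqrt(variance[i] + eps);
}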
+ +#pragma once + +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_batchnorm_forward.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_batchnorm_forward_impl(int do_verification, + int init_method, + bool do_dumpout, + bool time_kernel, + const std::vector inOutLengths, + const std::vector reduceDims, + bool updateMovingAverage, + bool saveMeanAndInvVariance, + double averageFactor, + double epsilon) +{ + if(inOutLengths.size() != Rank || reduceDims.size() != NumBatchNormReduceDim) + { + throw std::runtime_error("Invalid tensor lengths or number of reduce dimensions!"); + }; + + std::vector scaleBiasMeanVarLengths; + + // used for calculating the effective transferred bytes by each operation + size_t total_length; + size_t invariant_length = 1; + + total_length = + std::accumulate(inOutLengths.begin(), inOutLengths.end(), 1, std::multiplies{}); + + if(std::any_of(reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + for(int dim = 0; dim < Rank; dim++) + { + if(std::none_of(reduceDims.begin(), reduceDims.end(), [&](int d) { return dim == d; })) + { + scaleBiasMeanVarLengths.push_back(inOutLengths[dim]); + invariant_length *= inOutLengths[dim]; + }; + } + + // input data of the batchnorm forward algorithm + Tensor x(inOutLengths); + Tensor bnScale(scaleBiasMeanVarLengths); + Tensor bnBias(scaleBiasMeanVarLengths); + + // output data of the batchnorm forward algorithm + Tensor y_ref(inOutLengths); + Tensor y(inOutLengths); + + Tensor resultSaveMean_ref(scaleBiasMeanVarLengths); + Tensor resultSaveInvVariance_ref(scaleBiasMeanVarLengths); + + Tensor resultRunningMean_ref(scaleBiasMeanVarLengths); + Tensor resultRunningVariance_ref(scaleBiasMeanVarLengths); + + auto inOutStrides = x.mDesc.GetStrides(); + auto scaleBiasMeanVarStrides = bnScale.mDesc.GetStrides(); + + std::size_t num_thread = std::thread::hardware_concurrency(); + + if(updateMovingAverage) + { + const float x_mean = 0.0f; + const float x_stddev = 1.0f; + const float noise_stddev = 0.04f; + + // input data in normal distribution + x.GenerateTensorValue(GeneratorTensor_4{x_mean, x_stddev}, num_thread); + + // initialize the runningMean to be values with tiny variation to the mean of the x + // values + resultRunningMean_ref.GenerateTensorValue( + GeneratorTensor_4{x_mean, noise_stddev}, num_thread); + + // initialize the runningVariance to be values with tiny variation to the variance of + // the x values + resultRunningVariance_ref.GenerateTensorValue( + GeneratorTensor_4{x_stddev * x_stddev, noise_stddev}, num_thread); + } + else + { + if constexpr(ck::is_same_v) + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + else + x.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); + }; + + if(do_verification) + { + switch(init_method) + { + case 0: + bnScale.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_0{}, num_thread); + break; + case 1: + bnScale.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_1{0}, 
num_thread); + break; + case 2: + bnScale.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + bnScale.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); + bnBias.GenerateTensorValue(GeneratorTensor_3{-1.0f, 1.0f}, num_thread); + } + }; + + // these buffers are usually provided by the user application + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(XDataType) * y.mDesc.GetElementSpaceSize()); + DeviceMem bnScale_dev(sizeof(ScaleDataType) * bnScale.mDesc.GetElementSpaceSize()); + DeviceMem bnBias_dev(sizeof(BiasDataType) * bnBias.mDesc.GetElementSpaceSize()); + + // mean_dev or resultSaveMean_dev + DeviceMem resultSaveMean_dev(sizeof(MeanVarDataType) * + resultSaveMean_ref.mDesc.GetElementSpaceSize()); + // meansquare_dev or resultSaveInvVariance_dev + DeviceMem resultSaveInvVariance_dev(sizeof(MeanVarDataType) * + resultSaveInvVariance_ref.mDesc.GetElementSpaceSize()); + // resultRunningMean_dev + DeviceMem resultRunningMean_dev(sizeof(MeanVarDataType) * + resultRunningMean_ref.mDesc.GetElementSpaceSize()); + // resultRunningVariance_dev + DeviceMem resultRunningVariance_dev(sizeof(MeanVarDataType) * + resultRunningVariance_ref.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + bnScale_dev.ToDevice(bnScale.mData.data()); + bnBias_dev.ToDevice(bnBias.mData.data()); + + if(updateMovingAverage) + { + resultRunningMean_dev.ToDevice(resultRunningMean_ref.mData.data()); + resultRunningVariance_dev.ToDevice(resultRunningVariance_ref.mData.data()); + }; + + // used for storing the device result for verification when updateMovingAverage is enabled + Tensor resultRunningMean(scaleBiasMeanVarLengths); + Tensor resultRunningVariance(scaleBiasMeanVarLengths); + + // used for storing the device result for verification when saveMeanAndInvVariance is enabled + Tensor resultSaveMean(scaleBiasMeanVarLengths); + Tensor resultSaveInvVariance(scaleBiasMeanVarLengths); + + std::array arrInOutLengths; + std::array arrInOutStrides; + std::array arrScaleBiasMeanVarLengths; + std::array arrScaleBiasMeanVarStrides; + std::array arrReduceDims; + + std::copy(inOutLengths.begin(), inOutLengths.end(), arrInOutLengths.begin()); + std::copy(inOutStrides.begin(), inOutStrides.end(), arrInOutStrides.begin()); + std::copy(scaleBiasMeanVarLengths.begin(), + scaleBiasMeanVarLengths.end(), + arrScaleBiasMeanVarLengths.begin()); + std::copy(scaleBiasMeanVarStrides.begin(), + scaleBiasMeanVarStrides.end(), + arrScaleBiasMeanVarStrides.begin()); + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + // add device batchnorm-forward instances + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormFwd; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceBatchNormFwdInstance = + ck::tensor_operation::host::ReferenceBatchNormFwd; + + auto batchNormFwd_ref = ReferenceBatchNormFwdInstance{}; + + auto argument_ptr_ref = batchNormFwd_ref.MakeArgumentPointer( + arrInOutLengths, + arrInOutStrides, + arrInOutStrides, + 
arrReduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + x.mData.data(), + bnScale.mData.data(), + bnBias.mData.data(), + epsilon, + PassThroughOp{}, + y_ref.mData.data(), + saveMeanAndInvVariance ? resultSaveMean_ref.mData.data() : nullptr, + saveMeanAndInvVariance ? resultSaveInvVariance_ref.mData.data() : nullptr, + averageFactor, + updateMovingAverage ? resultRunningMean_ref.mData.data() : nullptr, + updateMovingAverage ? resultRunningVariance_ref.mData.data() : nullptr); + + if(!batchNormFwd_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters not supported by the reference instance, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = batchNormFwd_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + } + + int num_kernel = 0; + bool pass = true; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + arrInOutLengths, + arrInOutStrides, + arrInOutStrides, + arrReduceDims, + arrScaleBiasMeanVarLengths, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + arrScaleBiasMeanVarStrides, + x_dev.GetDeviceBuffer(), + bnScale_dev.GetDeviceBuffer(), + bnBias_dev.GetDeviceBuffer(), + epsilon, + PassThroughOp{}, + y_dev.GetDeviceBuffer(), + saveMeanAndInvVariance ? resultSaveMean_dev.GetDeviceBuffer() : nullptr, + saveMeanAndInvVariance ? resultSaveInvVariance_dev.GetDeviceBuffer() : nullptr, + averageFactor, + updateMovingAverage ? resultRunningMean_dev.GetDeviceBuffer() : nullptr, + updateMovingAverage ? resultRunningVariance_dev.GetDeviceBuffer() : nullptr); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + num_kernel++; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() + << " skipped due to unsupported argument: " << std::endl; + } + + continue; + }; + + size_t workspace_sz = inst_ptr->GetWorkSpaceSize(argument_ptr.get()); + + DeviceMem workspace_dev(workspace_sz); + + inst_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer()); + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + size_t num_bytes = 0; + + // inputing of x, scale, bias, outputing of y + num_bytes += total_length * (sizeof(XDataType) + sizeof(YDataType)) + + invariant_length * (sizeof(ScaleDataType) + sizeof(BiasDataType)); + + // outputing of mean, inv-variance + num_bytes += saveMeanAndInvVariance ? invariant_length * sizeof(MeanVarDataType) * 2 : 0; + + // updating of moving mean, variance + num_bytes += updateMovingAverage ? 
invariant_length * sizeof(MeanVarDataType) * 4 : 0; + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + using ck::utils::check_err; + bool single_pass; + + y_dev.FromDevice(y.mData.data()); + + if constexpr(ck::is_same_v) + single_pass = check_err(y.mData, y_ref.mData, "y results", 1e-2, 1e-2); + else + single_pass = check_err(y.mData, y_ref.mData, "y results", 4e-3, 4e-3); + + if(updateMovingAverage) + { + resultRunningMean_dev.FromDevice(resultRunningMean.mData.data()); + resultRunningVariance_dev.FromDevice(resultRunningVariance.mData.data()); + + // clang-format off + single_pass = single_pass && check_err(resultRunningMean.mData, resultRunningMean_ref.mData, "average mean results", 1.5e-5, 1.5e-5); + single_pass = single_pass && check_err(resultRunningVariance.mData, resultRunningVariance_ref.mData, "average variance results", 1e-5, 1e-5); + // clang-format on + }; + + if(saveMeanAndInvVariance) + { + resultSaveMean_dev.FromDevice(resultSaveMean.mData.data()); + resultSaveInvVariance_dev.FromDevice(resultSaveInvVariance.mData.data()); + + // clang-format off + single_pass = single_pass && check_err(resultSaveMean.mData, resultSaveMean_ref.mData, "mean results", 3e-5, 3e-5); + single_pass = single_pass && check_err(resultSaveInvVariance.mData, resultSaveInvVariance_ref.mData, "inv-variance results", 7e-5, 7e-5); + // clang-format on + }; + + pass = pass && single_pass; + }; + + if(do_dumpout) + { + using ck::host_common::dumpBufferToFile; + + // clang-format off + dumpBufferToFile("dump_x.bin", x.mData.data(), x.mDesc.GetElementSize()); + dumpBufferToFile("dump_y.bin", y.mData.data(), y.mDesc.GetElementSize()); + dumpBufferToFile("dump_y_ref.bin", y_ref.mData.data(), y_ref.mDesc.GetElementSize()); + // clang-format off + + if(saveMeanAndInvVariance) + { + // clang-format off + dumpBufferToFile("dump_mean.bin", resultSaveMean.mData.data(), resultSaveMean.mDesc.GetElementSize()); + dumpBufferToFile("dump_mean_ref.bin", resultSaveMean_ref.mData.data(), resultSaveMean_ref.mDesc.GetElementSize()); + dumpBufferToFile("dump_invvar.bin", resultSaveInvVariance.mData.data(), resultSaveInvVariance.mDesc.GetElementSize()); + dumpBufferToFile("dump_invvar_ref.bin", resultSaveInvVariance_ref.mData.data(), resultSaveInvVariance_ref.mDesc.GetElementSize()); + // clang-format on + }; + }; + } + + if(time_kernel) + { + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp new file mode 100644 index 00000000..86d394da --- /dev/null +++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp @@ -0,0 +1,248 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
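// The batchnorm forward profiler above derives its GB/s figure from an
// effective-byte model rather than from measured traffic: every element of x is
// read and every element of y written, scale and bias are read once per
// invariant element, saving mean / inv-variance writes two extra per-element
// values, and updating the moving statistics accounts for four (presumably the
// read-modify-write of the two running tensors). A sketch of that accounting,
// with element sizes passed in explicitly:
#include <cstddef>

inline std::size_t batchnorm_fwd_bytes(std::size_t total_length,     // elements of x / y
                                       std::size_t invariant_length, // elements of scale / bias / mean / var
                                       std::size_t x_size,
                                       std::size_t y_size,
                                       std::size_t scale_size,
                                       std::size_t bias_size,
                                       std::size_t meanvar_size,
                                       bool save_mean_inv_variance,
                                       bool update_moving_average)
{
    std::size_t num_bytes =
        total_length * (x_size + y_size) + invariant_length * (scale_size + bias_size);

    if(save_mean_inv_variance) // write mean and inv-variance
        num_bytes += invariant_length * meanvar_size * 2;

    if(update_moving_average) // running mean and running variance
        num_bytes += invariant_length * meanvar_size * 4;

    return num_bytes;
}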
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/convolution_backward_data.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" + +namespace ck { +namespace profiler { + +template +void show_data_nhwc_layout(Tensor& nhwc) +{ + std::cout << "["; + for(int n = 0; n < ck::type_convert(nhwc.mDesc.GetLengths()[0]); n++) + { + std::cout << "["; + for(int hi = 0; hi < ck::type_convert(nhwc.mDesc.GetLengths()[2]); hi++) + { + std::cout << "["; + for(int wi = 0; wi < ck::type_convert(nhwc.mDesc.GetLengths()[3]); wi++) + { + std::cout << "["; + for(int c = 0; c < ck::type_convert(nhwc.mDesc.GetLengths()[1]); c++) + { + std::cout << static_cast(nhwc(n, c, hi, wi)) << " "; + } + std::cout << "]"; + } + std::cout << "]"; + } + std::cout << "]"; + } + std::cout << "]"; +} + +template +bool profile_conv_bwd_data_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + Tensor input_host_result(in_g_n_c_wis_desc); + Tensor input_device_result(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input_host_result.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + output.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + output.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize()); + + out_device_buf.ToDevice(output.mData.data()); + wei_device_buf.ToDevice(weight.mData.data()); + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData{}; + + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(input_host_result, + weight, + output, + 
conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_argument); + } + + using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device Conv instances + bool pass = true; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.N_, + conv_param.K_, + conv_param.C_, + conv_param.input_spatial_lengths_, + conv_param.filter_spatial_lengths_, + conv_param.output_spatial_lengths_, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // for conv bwd data, some input tensor element are zero, but not written by kernel, + // need to set zero + in_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s" << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + in_device_buf.FromDevice(input_device_result.mData.data()); + + pass = pass & ck::utils::check_err(input_device_result, input_host_result); + + if(do_log) + { + std::cout << "in : "; + show_data_nhwc_layout(output); + std::cout << std::endl; + + std::cout << "wei: "; + show_data_nhwc_layout(weight); + std::cout << std::endl; + + std::cout << "out_host : "; + show_data_nhwc_layout(input_host_result); + std::cout << std::endl; + + std::cout << "out_device: "; + show_data_nhwc_layout(input_device_result); + std::cout << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best configuration parameters:" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp new file mode 100644 index 00000000..1aebef8b --- /dev/null +++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_add_impl.hpp @@ -0,0 +1,278 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
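// The fused conv + bias + relu + add profiler below prices the convolution at
// the usual direct-convolution count of 2*N*K*Ho*Wo*C*Y*X FLOPs (one multiply
// plus one add per MAC) and adds the bias vector and the residual tensor to the
// plain convolution byte traffic. A sketch of both formulas, with sizes passed
// as element counts and per-element byte sizes:
#include <cstddef>

inline std::size_t conv2d_fwd_flops(std::size_t N, std::size_t K, std::size_t Ho, std::size_t Wo,
                                    std::size_t C, std::size_t Y, std::size_t X)
{
    return std::size_t(2) * N * K * Ho * Wo * C * Y * X;
}

inline std::size_t conv2d_bias_add_bytes(std::size_t N, std::size_t K, std::size_t C,
                                         std::size_t Hi, std::size_t Wi,
                                         std::size_t Ho, std::size_t Wo,
                                         std::size_t Y, std::size_t X,
                                         std::size_t in_size,  // sizeof(InDataType)
                                         std::size_t wei_size, // sizeof(WeiDataType)
                                         std::size_t out_size) // sizeof(OutDataType)
{
    return in_size * (N * C * Hi * Wi)     // input read
           + wei_size * (K * C * Y * X)    // weight read
           + out_size * (N * K * Ho * Wo)  // output write
           + out_size * K                  // bias read (stored as OutDataType)
           + out_size * (N * K * Ho * Wo); // residual read
}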
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation_add.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using DeviceConvFwdBiasReluAddPtr = + DeviceConvFwdBiasActivationAddPtr; + +void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances( + std::vector&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +void profile_conv_fwd_bias_relu_add_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) +{ + const ck::index_t Y = filter_spatial_lengths[0]; + const ck::index_t X = filter_spatial_lengths[1]; + + const ck::index_t Hi = input_spatial_lengths[0]; + const ck::index_t Wi = input_spatial_lengths[1]; + + const ck::index_t Ho = output_spatial_lengths[0]; + const ck::index_t Wo = output_spatial_lengths[1]; + + auto f_host_tensor_descriptor = + [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { + using namespace ck::literals; + + if constexpr(is_same::value || + is_same::value || + is_same::value) + { + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}); + } + else if constexpr(is_same::value || + is_same::value || + is_same::value) + { + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); + } + }; + + Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); + Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); + Tensor out_n_k_ho_wo_host_result( + f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + Tensor out_n_k_ho_wo_device_result( + f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + + // bias: assume contiguous 1d vector + Tensor bias_k({K}); + + // residual: assume same layout as output tensor + Tensor resi_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + + std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; + std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; + std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; + std::cout << "bias_k: " << bias_k.mDesc << std::endl; + std::cout << "resi_n_k_ho_wo: " << resi_n_k_ho_wo.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + 
bias_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + resi_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(do_verification) + { + using ReferenceConvFwdInstance = + ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation_Add; + + auto ref_conv = ReferenceConvFwdInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo_host_result, + bias_k, + resi_n_k_ho_wo, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * + out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize()); + DeviceMem resi_device_buf(sizeof(OutDataType) * resi_n_k_ho_wo.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); + bias_device_buf.ToDevice(bias_k.mData.data()); + resi_device_buf.ToDevice(resi_n_k_ho_wo.mData.data()); + + using DeviceConvFwdBiasReluAddPtr = ck::tensor_operation::device:: + DeviceConvFwdBiasActivationAddPtr; + + // add device operator instances + std::vector op_ptrs; + + if constexpr(ck::is_same_v, ck::half_t> && + ck::is_same_v, ck::half_t> && + ck::is_same_v, ck::half_t>) + { + ck::tensor_operation::device::instance:: + add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances(op_ptrs); + } + + if(op_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! 
no device Conv instance found"); + } + + std::string best_conv_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device Conv instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + static_cast(bias_device_buf.GetDeviceBuffer()), + static_cast(resi_device_buf.GetDeviceBuffer()), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string conv_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; + + std::size_t num_btype = + sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) + + sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K) + + sizeof(OutDataType) * (N * K * Ho * Wo); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << conv_name << std::endl; + + if(tflops > best_tflops) + { + best_conv_name = conv_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); + + ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in_n_c_hi_wi.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "wei: ", wei_k_c_y_x.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",") + << std::endl; + } + } + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp new file mode 100644 index 00000000..2bac1443 --- /dev/null +++ b/profiler/include/profiler/profile_conv_fwd_bias_relu_impl.hpp @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
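// This header and the conv + bias + relu + add header above differ only in the
// output element-wise operator (AddRelu here, AddReluAdd above). Assuming the
// conventional reading of those names -- the authoritative definitions live in
// ck/tensor_operation/gpu/element/element_wise_operation.hpp -- the fused
// epilogues amount to:
//   AddRelu    : out = max(acc + bias, 0)
//   AddReluAdd : out = max(acc + bias, 0) + residual
// A scalar sketch, for illustration only:
#include <algorithm>

inline float add_relu(float acc, float bias) { return std::max(acc + bias, 0.f); }

inline float add_relu_add(float acc, float bias, float residual)
{
    return std::max(acc + bias, 0.f) + residual;
}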
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd_bias_activation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd_bias_activation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using DeviceConvFwdBiasReluPtr = + DeviceConvFwdBiasActivationPtr; + +void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances( + std::vector&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +void profile_conv_fwd_bias_relu_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) +{ + const ck::index_t Y = filter_spatial_lengths[0]; + const ck::index_t X = filter_spatial_lengths[1]; + + const ck::index_t Hi = input_spatial_lengths[0]; + const ck::index_t Wi = input_spatial_lengths[1]; + + const ck::index_t Ho = output_spatial_lengths[0]; + const ck::index_t Wo = output_spatial_lengths[1]; + + auto f_host_tensor_descriptor = + [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { + using namespace ck::literals; + + if constexpr(is_same::value || + is_same::value || + is_same::value) + { + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz}); + } + else if constexpr(is_same::value || + is_same::value || + is_same::value) + { + return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_}); + } + }; + + Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); + Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); + Tensor out_n_k_ho_wo_host_result( + f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + Tensor out_n_k_ho_wo_device_result( + f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); + + // bias: assume contiguous 1d vector + Tensor bias_k({K}); + + std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; + std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; + std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; + std::cout << "bias_k: " << bias_k.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + bias_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + bias_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::AddRelu; + + const auto 
in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + if(do_verification) + { + using ReferenceConvFwdInstance = + ck::tensor_operation::host::ReferenceConvFwd_Bias_Activation; + + auto ref_conv = ReferenceConvFwdInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, + wei_k_c_y_x, + out_n_k_ho_wo_host_result, + bias_k, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + ref_invoker.Run(ref_argument); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * + out_n_k_ho_wo_device_result.mDesc.GetElementSpaceSize()); + DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); + wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); + bias_device_buf.ToDevice(bias_k.mData.data()); + + using DeviceConvFwdBiasReluPtr = ck::tensor_operation::device:: + DeviceConvFwdBiasActivationPtr; + + // add device operator instances + std::vector op_ptrs; + + if constexpr(ck::is_same_v, ck::half_t> && + ck::is_same_v, ck::half_t> && + ck::is_same_v, ck::half_t>) + { + ck::tensor_operation::device::instance:: + add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances(op_ptrs); + } + + if(op_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! no device Conv instance found"); + } + + std::string best_conv_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device Conv instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + static_cast(bias_device_buf.GetDeviceBuffer()), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string conv_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; + + std::size_t num_btype = + sizeof(InDataType) * (N * C * Hi * Wi) + sizeof(WeiDataType) * (K * C * Y * X) + + sizeof(OutDataType) * (N * K * Ho * Wo) + sizeof(OutDataType) * (K); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << conv_name << std::endl; + + if(tflops > best_tflops) + { + best_conv_name = conv_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); + + ck::utils::check_err(out_n_k_ho_wo_device_result, out_n_k_ho_wo_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in_n_c_hi_wi.mData, ",") + << std::endl; + 
LogRangeAsType(std::cout << "wei: ", wei_k_c_y_x.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",") + << std::endl; + } + } + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp new file mode 100644 index 00000000..1f3ba8f0 --- /dev/null +++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_conv_fwd_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor host_output(out_g_n_k_wos_desc); + Tensor device_output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weight.mData.data()); + + // run 
reference op + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd{}; + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weight, + host_output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + // init host output to zero + host_output.SetZero(); + + ref_invoker.Run(ref_argument); + } + + using DeviceOp = ck::tensor_operation::device::DeviceConvFwd; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + bool pass = true; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.N_, + conv_param.K_, + conv_param.C_, + conv_param.input_spatial_lengths_, + conv_param.filter_spatial_lengths_, + conv_param.GetOutputSpatialLengths(), + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init output to zero before profiling next kernel + out_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(device_output.mData.data()); + + pass = pass & ck::utils::check_err(device_output, host_output); + + if(do_log) + { + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; + LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_output : ", host_output.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "device_output: ", device_output.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best configuration parameters:" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp b/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp new file mode 100644 index 00000000..1e69ebc8 --- /dev/null +++ b/profiler/include/profiler/profile_convnd_bwd_data_impl.hpp @@ -0,0 
+1,486 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_bwd_data.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/conv_util.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using INT8 = int8_t; + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using DeviceConvBwdDataNoOpPtr = + DeviceConvBwdDataPtr; +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances( + std::vector&); +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances( + std::vector&); +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances( + std::vector&); +void add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances( + std::vector&); + +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector&); +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector&); +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector&); +void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( + std::vector&); + +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances( + std::vector&); +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances( + std::vector&); +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( + std::vector&); +void add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances( + std::vector&); +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { +using DeviceConvBwdDataNoOpPtr = ck::tensor_operation::device::instance::DeviceConvBwdDataNoOpPtr; + +template +HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} +template +HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} +template +HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { 
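+        // Dispatch on the spatial-dimension count; each case builds the host descriptor for the
+        // given dims with the OutLayout tag, so only the tensor rank differs between 1D, 2D and 3D.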
+ case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} +template +void get_device_conv_bwd_data_op_ptr( + InDataType, WeiDataType, OutDataType, std::vector&, int) +{ + std::cout << "can not find device conv bwd data" << std::endl; + exit(1); +} +template <> +void get_device_conv_bwd_data_op_ptr( + F32, F32, F32, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); + break; + default: break; + } +} +template <> +void get_device_conv_bwd_data_op_ptr( + F16, F16, F16, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); + break; + default: break; + } +} +template <> +void get_device_conv_bwd_data_op_ptr( + BF16, BF16, BF16, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); + break; + default: break; + } +} +template <> +void get_device_conv_bwd_data_op_ptr( + INT8, INT8, INT8, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_data_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_data_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs); + break; + default: break; + } +} + +template +static bool check_out(const Tensor& ref, const Tensor& result) +{ + float max_diff = 1e-6; + + for(std::size_t i = 0; i < ref.mData.size(); ++i) + { + float diff = std::abs(double(ref.mData[i]) - double(result.mData[i])); + if(max_diff < diff) + { + return false; + } + } + return true; +} +template +void show_data_nhwc_layout(Tensor& nhwc) +{ + std::cout << "["; + for(int n = 0; n < ck::type_convert(nhwc.mDesc.GetLengths()[0]); n++) + { + std::cout << "["; + for(int hi = 0; hi < ck::type_convert(nhwc.mDesc.GetLengths()[2]); hi++) + { + std::cout << "["; + for(int wi = 0; wi < ck::type_convert(nhwc.mDesc.GetLengths()[3]); wi++) + { + 
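+                // print the channel values at this (n, hi, wi) position as one bracketed group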
std::cout << "["; + for(int c = 0; c < ck::type_convert(nhwc.mDesc.GetLengths()[1]); c++) + { + std::cout << static_cast(nhwc(n, c, hi, wi)) << " "; + } + std::cout << "]"; + } + std::cout << "]"; + } + std::cout << "]"; + } + std::cout << "]"; +} + +template +bool profile_convnd_bwd_data_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + ck::index_t N, + ck::index_t K, + ck::index_t C, + const std::vector& input_spatial_lengths, + const std::vector& filter_spatial_lengths, + const std::vector& output_spatial_lengths, + const std::vector& conv_filter_strides, + const std::vector& conv_filter_dilations, + const std::vector& input_left_pads, + const std::vector& input_right_pads) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + std::vector input_dims{static_cast(N), static_cast(C)}; + input_dims.insert( + std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths)); + + std::vector filter_dims{static_cast(K), static_cast(C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths)); + + std::vector output_dims{static_cast(N), static_cast(K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input_host_result( + get_input_host_tensor_descriptor(input_dims, NDimSpatial)); + Tensor input_device_result( + get_input_host_tensor_descriptor(input_dims, NDimSpatial)); + Tensor weights( + get_filters_host_tensor_descriptor(filter_dims, NDimSpatial)); + Tensor output( + get_output_host_ensor_descriptor(output_dims, NDimSpatial)); + + std::cout << "input: " << input_host_result.mDesc << std::endl; + std::cout << "weights: " << weights.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + output.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weights.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + output.GenerateTensorValue(GeneratorTensor_1{1}); + weights.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input_device_result.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); + + out_device_buf.ToDevice(output.mData.data()); + wei_device_buf.ToDevice(weights.mData.data()); + + // reset input to zero + in_device_buf.SetZero(); + + if(do_verification) + { + auto RunReference = [&](auto& ref_conv) { + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(input_host_result, + weights, + output, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_argument); + }; + + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdData(); + RunReference(ref_conv); + } + + // add device Conv instances + std::vector conv_ptrs; + get_device_conv_bwd_data_op_ptr( + InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial); + + if(conv_ptrs.size() <= 0) + { + throw 
std::runtime_error("wrong! no device Conv instance found"); + } + + std::string best_conv_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device Conv instances + bool success = true; + for(auto& conv_ptr : conv_ptrs) + { + auto argument_ptr = conv_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + auto invoker_ptr = conv_ptr->MakeInvokerPointer(); + + if(conv_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string conv_name = conv_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = + ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = + ck::utils::conv::get_btype( + N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s" << std::endl; + + if(tflops > best_tflops) + { + best_conv_name = conv_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + in_device_buf.FromDevice(input_device_result.mData.data()); + + if(!check_out(input_host_result, input_device_result)) + { + std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl; + + success = false; + } + else + { + std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; + } + + success = ck::utils::check_err(input_host_result, input_device_result); + + if(do_log) + { + std::cout << "in : "; + show_data_nhwc_layout(output); + std::cout << std::endl; + + std::cout << "wei: "; + show_data_nhwc_layout(weights); + std::cout << std::endl; + + std::cout << "out_host : "; + show_data_nhwc_layout(input_host_result); + std::cout << std::endl; + + std::cout << "out_device: "; + show_data_nhwc_layout(input_device_result); + std::cout << std::endl; + } + } + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; + return success; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp b/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp new file mode 100644 index 00000000..e37c887a --- /dev/null +++ b/profiler/include/profiler/profile_convnd_bwd_weight_impl.hpp @@ -0,0 +1,474 @@ +#pragma once + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_backward_weight.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/conv_util.hpp" +#include "ck/library/host_tensor/device_memory.hpp" +#include "ck/library/host_tensor/host_tensor.hpp" +#include "ck/library/host_tensor/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_backward_weight.hpp" + +using F16 = ck::half_t; +using F32 = 
float; +using BF16 = ck::bhalf_t; + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using DeviceConvndBwdWeightNoOpPtr = + DeviceConvBwdWeightPtr; + +void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances( + std::vector&); +void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances( + std::vector&); +void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances( + std::vector&); + +void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances( + std::vector&); +void add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances( + std::vector&); +void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances( + std::vector&); + +void add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances( + std::vector&); +void add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances( + std::vector&); +void add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances( + std::vector&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +using DeviceConvndBwdWeightNoOpPtr = + ck::tensor_operation::device::instance::DeviceConvndBwdWeightNoOpPtr; + +template +HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, InLayout{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +template +HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, WeiLayout{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +template +HostTensorDescriptor get_output_host_ensor_descriptor(const std::vector& dims, + int num_dim_spatial = 2) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, OutLayout{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +template +void get_device_conv_bwd_weight_op_ptr( + InDataType, WeiDataType, OutDataType, std::vector&, int) +{ + std::cout << "can not find device conv bwd weight" << std::endl; + exit(1); +} + +template <> +void get_device_conv_bwd_weight_op_ptr( + F32, F32, F32, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + 
add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); + break; + default: break; + } +} + +template <> +void get_device_conv_bwd_weight_op_ptr( + F16, F16, F16, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + add_device_convnd_bwd_weight_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); + break; + default: break; + } +} + +template <> +void get_device_conv_bwd_weight_op_ptr( + BF16, BF16, BF16, std::vector& conv_ptrs, int num_dim_spatial) +{ + switch(num_dim_spatial) + { + case 1: + ck::tensor_operation::device::instance:: + add_device_conv1d_bwd_weight_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); + break; + case 2: + ck::tensor_operation::device::instance:: + add_device_conv2d_bwd_weight_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); + break; + case 3: + ck::tensor_operation::device::instance:: + add_device_conv3d_bwd_weight_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); + break; + default: break; + } +} + +template +void show_data_nhwc_layout(Tensor& nhwc) +{ + std::cout << "["; + for(int n = 0; n < ck::type_convert(nhwc.mDesc.GetLengths()[0]); n++) + { + std::cout << "["; + for(int hi = 0; hi < ck::type_convert(nhwc.mDesc.GetLengths()[2]); hi++) + { + std::cout << "["; + for(int wi = 0; wi < ck::type_convert(nhwc.mDesc.GetLengths()[3]); wi++) + { + std::cout << "["; + for(int c = 0; c < ck::type_convert(nhwc.mDesc.GetLengths()[1]); c++) + { + std::cout << static_cast(nhwc(n, c, hi, wi)) << " "; + } + std::cout << "]"; + } + std::cout << "]"; + } + std::cout << "]"; + } + std::cout << "]"; +} + +template +bool profile_convnd_bwd_weight_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + ck::index_t N, + ck::index_t K, + ck::index_t C, + std::vector input_spatial_lengths, + std::vector filter_spatial_lengths, + std::vector output_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads, + ck::index_t split_k) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + std::vector input_dims{static_cast(N), static_cast(C)}; + input_dims.insert( + std::end(input_dims), std::begin(input_spatial_lengths), std::end(input_spatial_lengths)); + + std::vector filter_dims{static_cast(K), static_cast(C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths)); + + std::vector output_dims{static_cast(N), static_cast(K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths), + std::end(output_spatial_lengths)); + + Tensor input(get_input_host_tensor_descriptor(input_dims, NDimSpatial)); + Tensor weights_host_result( + get_filters_host_tensor_descriptor(filter_dims, NDimSpatial)); + Tensor 
weights_device_result( + get_filters_host_tensor_descriptor(filter_dims, NDimSpatial)); + Tensor output( + get_output_host_ensor_descriptor(output_dims, NDimSpatial)); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weights: " << weights_host_result.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + output.GenerateTensorValue(GeneratorTensor_2{-2, 2}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_1{1}); + output.GenerateTensorValue(GeneratorTensor_1{1}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weights_device_result.mDesc.GetElementSpace()); + DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); + + in_device_buf.ToDevice(input.mData.data()); + out_device_buf.ToDevice(output.mData.data()); + + // reset input to zero + wei_device_buf.SetZero(); + + if(do_verification) + { + auto RunReference = [&](auto& ref_conv) { + auto ref_invoker = ref_conv.MakeInvoker(); + + auto ref_argument = ref_conv.MakeArgument(input, + weights_host_result, + output, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + ref_invoker.Run(ref_argument); + }; + + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight(); + RunReference(ref_conv); + } + + // add device Conv instances + std::vector conv_ptrs; + get_device_conv_bwd_weight_op_ptr( + InDataType{}, WeiDataType{}, OutDataType{}, conv_ptrs, NDimSpatial); + + if(conv_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! no device Conv instance found"); + } + + std::string best_conv_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device Conv instances + bool success = true; + for(auto& conv_ptr : conv_ptrs) + { + // using atomic, so need to reset input, setzero is done in invoker + // if(split_k > 1) + //{ + // wei_device_buf.SetZero(); + //} + + auto argument_ptr = conv_ptr->MakeArgumentPointer( + static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + N, + K, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op, + split_k); + + if(!conv_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::cout << "wrong! 
device_conv with the specified compilation parameters does " + "not support this Conv problem" + << std::endl; + continue; + } + + auto invoker_ptr = conv_ptr->MakeInvokerPointer(); + std::string conv_name = conv_ptr->GetTypeString(); + float ave_time = 0; + + if(std::is_same::value && split_k > 1) + { + // alloc work space + size_t bwd_weight_workspace_size = conv_ptr->GetWorkSpaceSize(argument_ptr.get()); + if(bwd_weight_workspace_size <= 0) + { + printf("wrong work space size\n"); + exit(1); + } + DeviceMem wei_work_space_device_buf(bwd_weight_workspace_size); + wei_work_space_device_buf.SetZero(); + conv_ptr->SetWorkSpacePointer(argument_ptr.get(), + wei_work_space_device_buf.GetDeviceBuffer()); + ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + } + else + { + ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + } + + std::size_t flop = + ck::utils::conv::get_flops(N, C, K, filter_spatial_lengths, output_spatial_lengths); + std::size_t num_btype = ck::utils::conv::get_btype( + N, C, K, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths); + + float tflops = static_cast(flop) / 1.E9 / ave_time; + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s" << std::endl; + + if(tflops > best_tflops) + { + best_conv_name = conv_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + wei_device_buf.FromDevice(weights_device_result.mData.data()); + + success = ck::utils::check_err(weights_host_result, weights_device_result); + + if(success == false) + { + std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl; + } + else + { + std::cout << "Pass Info: " << conv_ptr->GetTypeString() << std::endl; + } + + if(do_log) + { + std::cout << "in : "; + show_data_nhwc_layout(output); + std::cout << std::endl; + + std::cout << "wei: "; + show_data_nhwc_layout(weights_host_result); + std::cout << std::endl; + + std::cout << "out : "; + show_data_nhwc_layout(input); + std::cout << std::endl; + + std::cout << "wei_device: "; + show_data_nhwc_layout(weights_device_result); + std::cout << std::endl; + } + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; + return success; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp new file mode 100644 index 00000000..7707e16b --- /dev/null +++ b/profiler/include/profiler/profile_elementwise_layernorm_impl.hpp @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
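+// Profiles fused elementwise-add + 2D layer normalization: the two inputs are added elementwise,
+// the sum is normalized over the last dimension with per-column gamma/beta, and every registered
+// device instance is timed and verified against a host reference.
+//
+// Minimal usage sketch (assumptions, not part of this patch: the template parameters are taken to
+// be <ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType>, in the order the
+// types appear in the body, and half_t/float are example choices):
+//
+//   bool ok = ck::profiler::profile_elementwise_layernorm_impl<
+//       ck::half_t, ck::half_t, ck::half_t, ck::half_t, float, ck::half_t>(
+//       /*do_verification=*/1, /*init_method=*/2, /*do_log=*/false, /*time_kernel=*/true,
+//       /*length (M, N)=*/{1024, 1024});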
+ +#pragma once + +#include + +#include "ck/ck.hpp" + +#include "ck/library/tensor_operation_instance/gpu/elementwise_normalization.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +namespace ck { +namespace profiler { + +template +void host_elementwise2D(HostTensorC& C, + const HostTensorA& A, + const HostTensorB& B, + const std::vector& shape, + Functor functor) +{ + using ctype = ck::remove_reference_t; + + for(std::size_t m = 0; m < shape[0]; ++m) + for(std::size_t n = 0; n < shape[1]; ++n) + { + auto a_val = A(m, n); + auto b_val = B(m, n); + ctype c_val = 0; + functor(c_val, a_val, b_val); + C(m, n) = c_val; + } +} + +template +bool profile_elementwise_layernorm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + using Add = ck::tensor_operation::element_wise::Add; + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + if(length.size() != 2) + return false; + + index_t M = length[0]; + index_t N = length[1]; + index_t Stride = N; + + constexpr int Rank = 2; + constexpr int NumReduceDim = 1; + + std::vector reduce_dim = {1}; + std::vector gammaBetaLength = {N}; + std::vector gammaBetaStride = {0, 1}; + + auto f_host_tensor_descriptor2d = [](std::size_t row, std::size_t col, std::size_t stride) { + using namespace ck::literals; + + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + }; + + Tensor a(length); + Tensor b(length); + Tensor gamma(gammaBetaLength); + Tensor beta(gammaBetaLength); + Tensor y(length); + Tensor host_y(length); + + switch(init_method) + { + case 0: + a.GenerateTensorValue(GeneratorTensor_1{}); + b.GenerateTensorValue(GeneratorTensor_1{}); + gamma.GenerateTensorValue(GeneratorTensor_1{}); + beta.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + a.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a.GenerateTensorValue(GeneratorTensor_3{0, 1}); + b.GenerateTensorValue(GeneratorTensor_3{0, 1}); + gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem a_dev(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); + DeviceMem b_dev(sizeof(ADataType) * b.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + a_dev.ToDevice(a.mData.data()); + b_dev.ToDevice(b.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + std::array input = {a_dev.GetDeviceBuffer(), b_dev.GetDeviceBuffer()}; + + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DeviceElementwiseNormalization< + ck::Tuple, + GammaDataType, + BetaDataType, + AccDataType, + YDataType, + Add, + PassThrough, + 2, + 1>; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << 
instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using XDataType = ADataType; + std::vector mn = {static_cast(M), + static_cast(N)}; + Tensor x(f_host_tensor_descriptor2d(M, N, Stride)); + host_elementwise2D, Tensor, Tensor, Add>( + x, a, b, mn, Add{}); + + using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, {M, N}, {1}, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + length, + { + std::vector{a.mDesc.GetStrides().begin(), a.mDesc.GetStrides().end()}, + std::vector{b.mDesc.GetStrides().begin(), b.mDesc.GetStrides().end()}, + }, + gammaBetaStride, + gammaBetaStride, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + reduce_dim, + 1e-4, + input, + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + Add{}, + PassThrough{}); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + continue; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = a.mDesc.GetElementSize() * sizeof(ADataType) + + b.mDesc.GetElementSize() * sizeof(BDataType) + + gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + + beta.mDesc.GetElementSize() * sizeof(BetaDataType) + + y.mDesc.GetElementSize() * sizeof(YDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + y_dev.FromDevice(y.mData.data()); + + bool pass = + ck::utils::check_err(y.mData, host_y.mData, "Error: Incorrect results", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b : ", b.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; + LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." 
<< std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " + << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is tested" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp new file mode 100644 index 00000000..3cc2ea3b --- /dev/null +++ b/profiler/include/profiler/profile_gemm_add_add_fastgelu_impl.hpp @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_add_fastgelu.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_add_add_fastgelu_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideD0, + int StrideD1, + int StrideE) +{ + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{})); + Tensor d1_m_n(f_host_tensor_descriptor(M, N, StrideD1, D1Layout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl; + std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d1_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + d1_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using AddAddFastGelu = 
ck::tensor_operation::element_wise::AddAddFastGelu; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CDEElementOp = AddAddFastGelu; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddAddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // run reference + if(do_verification) + { + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n), d1_m_n(m, n)); + } + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); + DeviceMem d1_m_n_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d0_m_n_device_buf.ToDevice(d0_m_n.mData.data()); + d1_m_n_device_buf.ToDevice(d1_m_n.mData.data()); + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + bool pass = true; + + // profile device operation instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer(), + d1_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0, StrideD1}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init E to zero before profiling a kernel + e_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + 
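+                // copy E back from the device and fold this instance's check into the overall pass flag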
e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp new file mode 100644 index 00000000..d53a6589 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_add_fastgelu_impl.hpp @@ -0,0 +1,232 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_add_fastgelu.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_add_fastgelu_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideD0, + int StrideE) +{ + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d0_m_n(f_host_tensor_descriptor(M, N, StrideD0, D0Layout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d0_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using AddFastGelu = ck::tensor_operation::element_wise::AddFastGelu; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CDEElementOp = AddFastGelu; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + 
ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::AddFastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // run reference + if(do_verification) + { + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d0_m_n(m, n)); + } + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_m_n_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d0_m_n_device_buf.ToDevice(d0_m_n.mData.data()); + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + bool pass = true; + + // profile device operation instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d0_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD0}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init E to zero before profiling a kernel + e_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp 
b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp new file mode 100644 index 00000000..b4ec78cd --- /dev/null +++ b/profiler/include/profiler/profile_gemm_bias_add_reduce_impl.hpp @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F16 = ck::half_t; +using ReducePtrsGlobal = ck::Tuple; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using DeviceGemmBiasAddReduceNoOpPtr = + ck::tensor_operation::device::DeviceGemmReducePtr<1, ReducePtrsGlobal::Size()>; + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( + std::vector&); + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( + std::vector&); + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( + std::vector&); + +void add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( + std::vector&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +void profile_gemm_bias_add_reduce_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int StrideD0) +{ + auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) { + return HostTensorDescriptor({len}, {stride}); + }; + + auto f_host_tensor_descriptor2d = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor2d(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor2d(K, N, StrideB, BLayout{})); + + Tensor c_m_n_host_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); + Tensor bias_n(f_host_tensor_descriptor1d(N, 1)); + Tensor d0_m_n(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); + Tensor reduce0_m_host_result({M}); + Tensor reduce1_m_host_result({M}); + + Tensor c_m_n_device_result(f_host_tensor_descriptor2d(M, N, StrideC, CLayout{})); + Tensor reduce0_m_device_result({M}); + Tensor reduce1_m_device_result({M}); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << 
std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl; + std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl; + + std::size_t num_thread = 1; + switch(init_method) + { + case 0: break; + case 1: + std::srand(0); + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + bias_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + d0_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + std::srand(0); + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + bias_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + d0_m_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + } + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CElementOp = PassThrough; + using D0ElementOp = PassThrough; + using ReduceOp0 = ck::reduce::Add; + using ReduceOp1 = ck::reduce::Add; + using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; + using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + std::array gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op}; + + auto d0_element_op = D0ElementOp{}; + const auto reduce0_op = ReduceOp0{}; + const auto reduce1_op = ReduceOp1{}; + + auto passthrough = UnaryIdenticElementOp{}; + auto square = UnarySquareElementOp{}; + auto div = UnaryDivElementOp{N}; + std::array reduce_in_element_ops = {&passthrough, &square}; + std::array reduce_out_element_ops = {&div, &div}; + + if(do_verification) + { + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + using ReduceAccDataType = ReduceDataType; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + for(int n = 0; n < N; ++n) + { + ReduceAccDataType acc = static_cast(c_m_n_host_result(m, n)) + + static_cast(bias_n(n)); + + ReduceAccDataType d0 = static_cast(d0_m_n(m, n)); + c_element_op(acc, acc); + d0_element_op(d0, d0); + acc += d0; + c_m_n_host_result(m, n) = static_cast(acc); + } + + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + auto reduce1_acc = reduce1_op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + ReduceAccDataType d0_val = + ck::type_convert(c_m_n_host_result(m, n)); + ReduceAccDataType d1_val; + + square(d1_val, d0_val); + reduce0_op(reduce0_acc, d0_val); + reduce1_op(reduce1_acc, d1_val); + } + + div(reduce0_acc, reduce0_acc); + div(reduce1_acc, reduce1_acc); + reduce0_m_host_result(m) = ck::type_convert(reduce0_acc); + reduce1_m_host_result(m) = ck::type_convert(reduce1_acc); + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + 
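+    // note: device buffers are sized from GetElementSpaceSize(), which follows the descriptor's
+    // strides rather than the logical element count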
DeviceMem bias_device_buf(sizeof(BiasDataType) * bias_n.mDesc.GetElementSpaceSize()); + DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize()); + DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * + reduce0_m_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce1_device_buf(sizeof(ReduceDataType) * + reduce1_m_device_result.mDesc.GetElementSpaceSize()); + + std::array p_reduces = {reduce0_device_buf.GetDeviceBuffer(), + reduce1_device_buf.GetDeviceBuffer()}; + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + bias_device_buf.ToDevice(bias_n.mData.data()); + d0_device_buf.ToDevice(d0_m_n.mData.data()); + + // add device GEMM instances + std::vector gemm_ptrs; + + if constexpr(is_same::value && is_same::value && + is_same::value) + { + if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_kn_mn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_mk_nk_mn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_kn_mn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_bias_add_mean_squaremean_xdl_cshuffle_f16_f16_f16_f16_f16_f32_f32_km_nk_mn_instances( + gemm_ptrs); + } + } + + if(gemm_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! 
no device GEMM instance found"); + } + + std::string best_gemm_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device GEMM instances + for(auto& gemm_ptr : gemm_ptrs) + { + auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + bias_device_buf.GetDeviceBuffer(), + {d0_device_buf.GetDeviceBuffer()}, + c_device_buf.GetDeviceBuffer(), + p_reduces, + M, + N, + K, + StrideA, + StrideB, + StrideC, + {StrideD0}, + gemm_element_ops, + {&d0_element_op}, + reduce_in_element_ops, + reduce_out_element_ops); + + auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); + + if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) + { + // init DO, D1 to 0 + reduce0_device_buf.SetZero(); + reduce1_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::string gemm_name = gemm_ptr->GetTypeString(); + + std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N; + + std::size_t num_byte = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N + sizeof(BiasDataType) * M * N + + sizeof(D0DataType) * M * N + sizeof(ReduceDataType) * M + + sizeof(ReduceDataType) * M; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_byte / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm_name << std::endl; + + if(tflops > best_tflops) + { + best_gemm_name = gemm_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); + reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); + + ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result); + ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host: ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d0_host: ", reduce0_m_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d0_device: ", reduce0_m_device_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d1_host: ", reduce1_m_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d1_device: ", reduce1_m_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << "does not support this GEMM problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_bilinear_impl.hpp b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp new file mode 100644 index 00000000..31bae281 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_bilinear_impl.hpp @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
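The numbers printed by the profiler above follow directly from the counts in its loop: ave_time is measured in milliseconds, so FLOPs / 1e9 / ms comes out in TFLOP/s and bytes / 1e6 / ms comes out in GB/s. A minimal, self-contained sketch of that unit conversion with made-up sizes and timing (fp16 operands assumed, 2 bytes per element):

#include <cstddef>
#include <cstdio>

int main()
{
    // Hypothetical problem size and measured average time; not taken from a real run.
    const std::size_t M = 3840, N = 4096, K = 4096;
    const float ave_time_ms = 1.25f;

    // 2 * M * N * K floating-point operations: one multiply and one add per inner-product term.
    const std::size_t flop = std::size_t(2) * M * N * K;

    // Bytes moved for fp16 A, B and C (2 bytes per element); epilogue tensors would add to this.
    const std::size_t num_byte = 2 * M * K + 2 * K * N + 2 * M * N;

    // flop / 1e9 is GFLOP, and GFLOP per millisecond equals TFLOP per second.
    const float tflops = static_cast<float>(flop) / 1.E9f / ave_time_ms;

    // byte / 1e6 is MB, and MB per millisecond equals GB per second.
    const float gb_per_sec = static_cast<float>(num_byte) / 1.E6f / ave_time_ms;

    std::printf("%.3f TFlops, %.3f GB/s\n", tflops, gb_per_sec);
    return 0;
}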
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_bilinear.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_bilinear_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideD, + int StrideE, + float alpha, + float beta) +{ + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "d_m_n: " << d_m_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + d_m_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + d_m_n.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + } + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Bilinear = ck::tensor_operation::element_wise::Bilinear; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CDEElementOp = Bilinear; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{alpha, beta}; + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple, + ELayout, + ADataType, + BDataType, + ck::Tuple, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::Bilinear>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // run reference + if(do_verification) + { + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + 
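The cde_element_op constructed above is the Bilinear functor, so once the reference GEMM has produced C the verification writes E(m, n) = alpha * C(m, n) + beta * D(m, n). A standalone sketch of that epilogue over plain row-major float buffers (its own identifiers, not the profiler's types):

#include <cstddef>
#include <vector>

// E = alpha * C + beta * D, element-wise over M x N row-major matrices.
void bilinear_epilogue(const std::vector<float>& c,
                       const std::vector<float>& d,
                       std::vector<float>& e,
                       std::size_t M,
                       std::size_t N,
                       float alpha,
                       float beta)
{
    for(std::size_t m = 0; m < M; ++m)
        for(std::size_t n = 0; n < N; ++n)
            e[m * N + n] = alpha * c[m * N + n] + beta * d[m * N + n];
}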
ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n)); + } + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem d_m_n_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + d_m_n_device_buf.ToDevice(d_m_n.mData.data()); + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + bool pass = true; + + // profile device operation instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer( + a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{d_m_n_device_buf.GetDeviceBuffer()}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{StrideD}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init E to zero before profiling a kernel + e_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp new file mode 100644 index 00000000..f9a544c0 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_fastgelu_impl.hpp @@ -0,0 +1,222 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
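Every profiler in this family runs the same selection loop: ask the instance factory for all device ops of the problem type, skip any instance whose IsSupportedArgument rejects the argument, time the rest, and keep the one with the highest TFLOPS. Stripped of the CK types, the control flow looks roughly like the sketch below; the Kernel struct and its callbacks are placeholders, not CK APIs.

#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Placeholder for one candidate kernel: a name, a feasibility check and a timed run.
struct Kernel
{
    std::string name;
    std::function<bool()> supported;
    std::function<float()> run_ms; // returns average time in milliseconds
};

// Return the name of the fastest supported kernel, mirroring the best_tflops bookkeeping above.
std::string pick_best(const std::vector<Kernel>& kernels, double flop)
{
    std::string best_name;
    float best_tflops = 0.f;

    for(const auto& k : kernels)
    {
        if(!k.supported())
        {
            std::cout << k.name << " does not support this problem" << std::endl;
            continue;
        }

        const float ms     = k.run_ms();
        const float tflops = static_cast<float>(flop / 1.e9 / ms);

        if(tflops > best_tflops)
        {
            best_tflops = tflops;
            best_name   = k.name;
        }
    }

    return best_name;
}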
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_fastgelu.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_fastgelu_impl(int do_verification, + int init_method, + bool /*do_log*/, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideE) +{ + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + Tensor e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "e_m_n: " << e_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using FastGelu = ck::tensor_operation::element_wise::FastGelu; + + using AElementOp = PassThrough; + using BElementOp = PassThrough; + using CDEElementOp = FastGelu; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto cde_element_op = CDEElementOp{}; + + using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleD< + ALayout, + BLayout, + ck::Tuple<>, + ELayout, + ADataType, + BDataType, + ck::Tuple<>, + EDataType, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::PassThrough, + ck::tensor_operation::element_wise::FastGelu>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // run reference + if(do_verification) + { + Tensor c_m_n({M, N}); + + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = + ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{}); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + for(int n = 0; n < N; ++n) + { + cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n)); + } + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * 
b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + bool pass = true; + + // profile device operation instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + std::array{}, + e_device_buf.GetDeviceBuffer(), + M, + N, + K, + StrideA, + StrideB, + std::array{}, + StrideE, + a_element_op, + b_element_op, + cde_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init E to zero before profiling a kernel + e_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + e_device_buf.FromDevice(e_m_n_device_result.mData.data()); + + pass = pass && ck::utils::check_err(e_m_n_device_result, e_m_n_host_result); + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_impl.hpp b/profiler/include/profiler/profile_gemm_impl.hpp new file mode 100644 index 00000000..9b164104 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_impl.hpp @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
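The FastGelu element-wise op used by the profiler above applies a fast approximation of the GELU activation to every GEMM output element. The widely used tanh form of that approximation is shown below as a generic illustration; it is not guaranteed to match the CK functor bit for bit.

#include <cmath>
#include <cstdio>

// Tanh-based fast GELU approximation:
// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
float fast_gelu(float x)
{
    const float k = 0.7978845608028654f; // sqrt(2 / pi)
    return 0.5f * x * (1.f + std::tanh(k * (x + 0.044715f * x * x * x)));
}

int main()
{
    const float xs[] = {-2.f, -1.f, 0.f, 1.f, 2.f};
    for(float x : xs)
        std::printf("fast_gelu(% .1f) = % .6f\n", x, fast_gelu(x));
    return 0;
}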
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +int profile_gemm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC) +{ + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + + using DeviceOp = ck::tensor_operation::device::DeviceGemm; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // Run reference op + if(do_verification) + { + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_op = ReferenceGemmInstance{}; + auto ref_invoker = ref_op.MakeInvoker(); + + auto ref_argument = ref_op.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = + 
op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA + << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_avg_time + << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + return pass ? 0 : 1; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_reduce_impl.hpp b/profiler/include/profiler/profile_gemm_reduce_impl.hpp new file mode 100644 index 00000000..370121a3 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_reduce_impl.hpp @@ -0,0 +1,353 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
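The f_host_tensor_descriptor lambda that opens each of these profilers encodes the stride convention used throughout: a row-major matrix stores element (i, j) at offset i * stride + j (strides {stride, 1}), while a column-major one stores it at i + j * stride (strides {1, stride}). A minimal sketch of that addressing, independent of the HostTensorDescriptor class:

#include <cassert>
#include <cstddef>

enum class Layout { RowMajor, ColumnMajor };

// Linear offset of element (row, col) for a matrix with the given leading stride.
std::size_t offset(Layout layout, std::size_t row, std::size_t col, std::size_t stride)
{
    return layout == Layout::RowMajor ? row * stride + col  // strides {stride, 1}
                                      : row + col * stride; // strides {1, stride}
}

int main()
{
    // 2 x 3 matrix, row-major with stride 3: element (1, 2) is the last of the 6 elements.
    assert(offset(Layout::RowMajor, 1, 2, 3) == 5);
    // 2 x 3 matrix, column-major with stride 2: element (1, 2) is again the last element.
    assert(offset(Layout::ColumnMajor, 1, 2, 2) == 5);
    return 0;
}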
+ +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using F32 = float; +using F16 = ck::half_t; +using ReducePtrsGlobal = ck::Tuple; +using Div = ck::tensor_operation::element_wise::UnaryDivide; +using Identity = ck::tensor_operation::element_wise::PassThrough; +using Square = ck::tensor_operation::element_wise::UnarySquare; +using ReduceInElementOps = ck::Tuple; +using ReduceOutElementOps = ck::Tuple; + +using DeviceGemmReduceNoOpPtr = + ck::tensor_operation::device::DeviceGemmReducePtr<0, ReducePtrsGlobal::Size()>; + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances( + std::vector&); + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances( + std::vector&); + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances( + std::vector&); + +void add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances( + std::vector&); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +bool profile_gemm_reduce_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC) +{ + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor reduce0_m_host_result({M}); + Tensor reduce1_m_host_result({M}); + + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor reduce0_m_device_result({M}); + Tensor reduce1_m_device_result({M}); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; + std::cout << "reduce0_m: " << reduce0_m_host_result.mDesc << std::endl; + std::cout << "reduce1_m: " << reduce1_m_host_result.mDesc << std::endl; + + std::size_t num_thread = 1; + switch(init_method) + { + case 0: break; + case 1: + std::srand(0); + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + std::srand(0); + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + } + + using AElementOp = 
ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + using ReduceOp0 = ck::reduce::Add; + using ReduceOp1 = ck::reduce::Add; + using UnaryIdenticElementOp = ck::tensor_operation::element_wise::PassThrough; + using UnarySquareElementOp = ck::tensor_operation::element_wise::UnarySquare; + using UnaryDivElementOp = ck::tensor_operation::element_wise::UnaryDivide; + + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; + auto c_element_op = CElementOp{}; + std::array gemm_element_ops = {&a_element_op, &b_element_op, &c_element_op}; + + const auto reduce0_op = ReduceOp0{}; + const auto reduce1_op = ReduceOp1{}; + + auto passthrough = UnaryIdenticElementOp{}; + auto square = UnarySquareElementOp{}; + auto div = UnaryDivElementOp{N}; + std::array reduce_in_element_ops = {&passthrough, &square}; + std::array reduce_out_element_ops = {&div, &div}; + + if(do_verification) + { + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + using ReduceAccDataType = ReduceDataType; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + + for(int m = 0; m < M; ++m) + { + auto reduce0_acc = reduce0_op.GetIdentityValue(); + auto reduce1_acc = reduce1_op.GetIdentityValue(); + + for(int n = 0; n < N; ++n) + { + ReduceAccDataType d0_val = + ck::type_convert(c_m_n_host_result(m, n)); + ReduceAccDataType d1_val; + + square(d1_val, d0_val); + reduce0_op(reduce0_acc, d0_val); + reduce1_op(reduce1_acc, d1_val); + } + + div(reduce0_acc, reduce0_acc); + div(reduce1_acc, reduce1_acc); + reduce0_m_host_result(m) = ck::type_convert(reduce0_acc); + reduce1_m_host_result(m) = ck::type_convert(reduce1_acc); + } + } + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce0_device_buf(sizeof(ReduceDataType) * + reduce0_m_device_result.mDesc.GetElementSpaceSize()); + DeviceMem reduce1_device_buf(sizeof(ReduceDataType) * + reduce1_m_device_result.mDesc.GetElementSpaceSize()); + + std::array p_reduces = {reduce0_device_buf.GetDeviceBuffer(), + reduce1_device_buf.GetDeviceBuffer()}; + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + + // add device GEMM instances + std::vector gemm_ptrs; + + if constexpr(is_same::value && is_same::value && + is_same::value) + { + if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_kn_mn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_mk_nk_mn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + ck::tensor_operation::device::instance:: + add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_kn_mn_instances( + gemm_ptrs); + } + else if constexpr(is_same::value && + is_same::value && + is_same::value) + { + 
ck::tensor_operation::device::instance:: + add_device_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_km_nk_mn_instances( + gemm_ptrs); + } + } + + if(gemm_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! no device GEMM instance found"); + } + + std::string best_gemm_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device GEMM instances + for(auto& gemm_ptr : gemm_ptrs) + { + auto argument_ptr = gemm_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(), + b_device_buf.GetDeviceBuffer(), + nullptr, + {}, + c_device_buf.GetDeviceBuffer(), + p_reduces, + M, + N, + K, + StrideA, + StrideB, + StrideC, + {}, + gemm_element_ops, + {}, + reduce_in_element_ops, + reduce_out_element_ops); + + auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); + + if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) + { + // init DO, D1 to 0 + reduce0_device_buf.SetZero(); + reduce1_device_buf.SetZero(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::string gemm_name = gemm_ptr->GetTypeString(); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + + sizeof(CDataType) * M * N + sizeof(CDataType) * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << gemm_name << std::endl; + + if(tflops > best_tflops) + { + best_gemm_name = gemm_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + reduce0_device_buf.FromDevice(reduce0_m_device_result.mData.data()); + reduce1_device_buf.FromDevice(reduce1_m_device_result.mData.data()); + + ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + ck::utils::check_err(reduce0_m_device_result, reduce0_m_host_result); + ck::utils::check_err(reduce1_m_device_result, reduce1_m_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host: ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d0_host: ", reduce0_m_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d0_device: ", reduce0_m_device_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d1_host: ", reduce1_m_host_result.mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "d1_device: ", reduce1_m_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << "does not support this GEMM problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_gemm_splitk_impl.hpp b/profiler/include/profiler/profile_gemm_splitk_impl.hpp new file mode 100644 index 00000000..e5d5f876 --- /dev/null +++ b/profiler/include/profiler/profile_gemm_splitk_impl.hpp @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
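The host reference above follows CK's reduction recipe: start the accumulator from the reduction's identity value, apply an input element-wise op (PassThrough or UnarySquare) to every element before folding it in, then apply an output element-wise op (UnaryDivide) to the finished accumulator. A generic sketch of that recipe, with std::plus standing in for ck::reduce::Add:

#include <cstdio>
#include <functional>
#include <vector>

// Fold `data` with `reduce`, starting from `identity`, transforming inputs with
// `in_op` and the finished accumulator with `out_op`.
template <typename T, typename Reduce, typename InOp, typename OutOp>
T reduce_with_identity(const std::vector<T>& data, T identity, Reduce reduce, InOp in_op, OutOp out_op)
{
    T acc = identity;
    for(const T& v : data)
        acc = reduce(acc, in_op(v));
    return out_op(acc);
}

int main()
{
    const std::vector<float> row = {1.f, 2.f, 3.f, 4.f};
    const float n = static_cast<float>(row.size());

    // Mean: identity 0, pass-through input op, divide-by-N output op.
    const float mean = reduce_with_identity<float>(
        row, 0.f, std::plus<float>{}, [](float x) { return x; }, [n](float a) { return a / n; });

    // Mean of squares: identity 0, square input op, divide-by-N output op.
    const float meansq = reduce_with_identity<float>(
        row, 0.f, std::plus<float>{}, [](float x) { return x * x; }, [n](float a) { return a / n; });

    std::printf("mean = %f, mean of squares = %f\n", mean, meansq);
    return 0;
}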
+ +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_gemm_splitk_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + int M, + int N, + int K, + int StrideA, + int StrideB, + int StrideC, + int KBatch) +{ + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + Tensor a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{})); + Tensor b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{})); + Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + + std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; + std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; + std::cout << "c_m_n: " << c_m_n_device_result.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + a_m_k.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + b_k_n.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + a_m_k.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + b_k_n.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); + DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); + DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + + a_device_buf.ToDevice(a_m_k.mData.data()); + b_device_buf.ToDevice(b_k_n.mData.data()); + c_device_buf.ToDevice(c_m_n_device_result.mData.data()); + + using DeviceOp = ck::tensor_operation::device::DeviceGemmSplitK; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + // Run reference GEMM + if(do_verification) + { + using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec 
= 0; + + // profile device GEMM instances + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + KBatch); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f32"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = f16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = bf16"; + } + else if constexpr(is_same::value) + { + std::cout << "Best Perf for datatype = int8"; + } + + if constexpr(is_same::value) + { + std::cout << " ALayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " ALayout = ColumnMajor"; + } + + if constexpr(is_same::value) + { + std::cout << " BLayout = RowMajor"; + } + else if constexpr(is_same::value) + { + std::cout << " BLayout = ColumnMajor"; + } + + std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA + << " StrideB = " << StrideB << " StrideC = " << StrideC << " : " << best_ave_time + << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp new file mode 100644 index 00000000..4f9aa983 --- /dev/null +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
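Split-K, which the GEMM profiler above and the backward-weight profiler below expose through their KBatch/split_k parameter, partitions the K dimension into slices so that several workgroups each compute a partial product and accumulate it into the same output, typically with atomic adds or a follow-up reduction; that accumulation is why the output buffer is zeroed before every timed run. The decomposition itself is just a reassociation of the inner sum, sketched here on the host with plain row-major float buffers:

#include <cstddef>
#include <vector>

// Accumulate one K-slice [k_begin, k_end) of C += A * B (row-major M x K, K x N, M x N).
void gemm_k_slice(const std::vector<float>& a,
                  const std::vector<float>& b,
                  std::vector<float>& c,
                  std::size_t M, std::size_t N, std::size_t K,
                  std::size_t k_begin, std::size_t k_end)
{
    for(std::size_t m = 0; m < M; ++m)
        for(std::size_t n = 0; n < N; ++n)
        {
            float partial = 0.f;
            for(std::size_t k = k_begin; k < k_end; ++k)
                partial += a[m * K + k] * b[k * N + n];
            c[m * N + n] += partial; // a device kernel would typically do this atomically
        }
}

// Full GEMM expressed as KBatch slice updates into a zero-initialized C.
void gemm_split_k(const std::vector<float>& a,
                  const std::vector<float>& b,
                  std::vector<float>& c, // must start at zero
                  std::size_t M, std::size_t N, std::size_t K,
                  std::size_t KBatch)
{
    const std::size_t k_per_batch = (K + KBatch - 1) / KBatch;
    for(std::size_t kb = 0; kb < KBatch; ++kb)
    {
        const std::size_t k_begin = kb * k_per_batch;
        const std::size_t k_end   = k_begin + k_per_batch < K ? k_begin + k_per_batch : K;
        if(k_begin < k_end)
            gemm_k_slice(a, b, c, M, N, K, k_begin, k_end);
    }
}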
+ +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_grouped_conv_bwd_weight_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + ck::index_t split_k) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + Tensor input(in_g_n_c_wis_desc); + Tensor weight_host_result(wei_g_k_c_xs_desc); + Tensor weight_device_result(wei_g_k_c_xs_desc); + Tensor output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight_host_result.mDesc << std::endl; + std::cout << "output: " << output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + output.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + output.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * + weight_device_result.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(input.mData.data()); + out_device_buf.ToDevice(output.mData.data()); + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weight_host_result, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + ref_invoker.Run(ref_argument); + } + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + 
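The ReferenceConvBwdWeight invoked above computes the weight gradient on the host: every filter tap accumulates products of the output gradient with the input elements that tap touched in the forward pass. A heavily simplified 2D sketch of that indexing (single group, NCHW/KCYX dense storage, no padding, unit dilation, its own identifiers):

#include <cstddef>
#include <vector>

// dW[k][c][y][x] = sum over n, ho, wo of dOut[n][k][ho][wo] * In[n][c][ho*stride_h + y][wo*stride_w + x]
// Simplifications: one group, zero padding, dilation 1, all tensors dense in the order written.
void conv2d_bwd_weight_ref(const std::vector<float>& in,   // N * C * Hi * Wi
                           const std::vector<float>& dout, // N * K * Ho * Wo
                           std::vector<float>& dwei,       // K * C * Y * X
                           std::size_t N, std::size_t C, std::size_t Hi, std::size_t Wi,
                           std::size_t K, std::size_t Ho, std::size_t Wo,
                           std::size_t Y, std::size_t X,
                           std::size_t stride_h, std::size_t stride_w)
{
    for(std::size_t k = 0; k < K; ++k)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t y = 0; y < Y; ++y)
                for(std::size_t x = 0; x < X; ++x)
                {
                    float acc = 0.f;
                    for(std::size_t n = 0; n < N; ++n)
                        for(std::size_t ho = 0; ho < Ho; ++ho)
                            for(std::size_t wo = 0; wo < Wo; ++wo)
                            {
                                const std::size_t hi = ho * stride_h + y;
                                const std::size_t wi = wo * stride_w + x;
                                acc += dout[((n * K + k) * Ho + ho) * Wo + wo] *
                                       in[((n * C + c) * Hi + hi) * Wi + wi];
                            }
                    dwei[((k * C + c) * Y + y) * X + x] = acc;
                }
}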
std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device Conv instances + bool all_pass = true; + + std::array input_spatial_lengths{}; + std::array filter_spatial_lengths{}; + std::array output_spatial_lengths{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); }; + + range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths)); + range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths)); + range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths)); + range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides)); + range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations)); + range_copy(conv_param.input_left_pads_, begin(input_left_pads)); + range_copy(conv_param.input_right_pads_, begin(input_right_pads)); + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(in_device_buf.GetDeviceBuffer()), + static_cast(wei_device_buf.GetDeviceBuffer()), + static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.C_, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op, + split_k); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // using atomic add, so need to reset input + wei_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + wei_device_buf.FromDevice(weight_device_result.mData.data()); + + bool pass = ck::utils::check_err(weight_device_result, weight_host_result); + + if(!pass) + { + std::cout << "Fail info: " << op_ptr->GetTypeString() << std::endl; + } + + all_pass &= pass; + + if(do_log) + { + LogRangeAsType(std::cout << "output : ", output.mData, ",") << std::endl; + ; + LogRangeAsType( + std::cout << "weight (device): ", weight_device_result.mData, ",") + << std::endl; + ; + LogRangeAsType( + std::cout << "weight (host): ", weight_host_result.mData, ",") + << std::endl; + ; + LogRangeAsType(std::cout << "input: ", input.mData, ",") << std::endl; + ; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + } + + std::cout << "Best configuration parameters:" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + return all_pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp 
b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp new file mode 100644 index 00000000..b201a2ed --- /dev/null +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_grouped_conv_fwd_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) +{ + using InElementOp = ck::tensor_operation::element_wise::PassThrough; + using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; + using OutElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + const auto out_element_op = OutElementOp{}; + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed(conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor host_output(out_g_n_k_wos_desc); + Tensor device_output(out_g_n_k_wos_desc); + + std::cout << "input: " << input.mDesc << std::endl; + std::cout << "weight: " << weight.mDesc << std::endl; + std::cout << "output: " << host_output.mDesc << std::endl; + + switch(init_method) + { + case 0: break; + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + 
input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weight.mData.data()); + + // run reference op + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd{}; + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(input, + weight, + host_output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + out_element_op); + + // init host output to zero + host_output.SetZero(); + + ref_invoker.Run(ref_argument); + } + + std::string best_op_name; + float best_avg_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + // profile device op instances + bool pass = true; + + auto run_impl = [&](auto& op_ptr, auto& argument_ptr) { + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + // re-init output to zero before profiling next kernel + out_device_buf.SetZero(); + + std::string op_name = op_ptr->GetTypeString(); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + out_device_buf.FromDevice(device_output.mData.data()); + + pass = pass & ck::utils::check_err(device_output, host_output); + + if(do_log) + { + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; + LogRangeAsType(std::cout << "weight: ", weight.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_output : ", host_output.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "device_output: ", device_output.mData, ",") + << std::endl; + } + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; + } + }; + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "xdl found " << op_ptrs.size() << " instances" << std::endl; + + for(auto& op_ptr : op_ptrs) + { + auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {}, + {}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + 
input_right_pads, + in_element_op, + wei_element_op, + out_element_op); + + run_impl(op_ptr, argument_ptr); + } + + std::cout << "Best configuration parameters:" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; + + return pass; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_grouped_gemm_impl.hpp b/profiler/include/profiler/profile_grouped_gemm_impl.hpp new file mode 100644 index 00000000..04f94a0f --- /dev/null +++ b/profiler/include/profiler/profile_grouped_gemm_impl.hpp @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/grouped_gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/literals.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_grouped_gemm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const std::vector& Ms, + const std::vector& Ns, + const std::vector& Ks, + const std::vector& StrideAs, + const std::vector& StrideBs, + const std::vector& StrideCs) +{ + + bool pass = true; + + auto f_host_tensor_descriptor = + [](std::size_t row, std::size_t col, std::size_t stride, auto layout) { + using namespace ck::literals; + + if(is_same::value) + { + return HostTensorDescriptor({row, col}, {stride, 1_uz}); + } + else + { + return HostTensorDescriptor({row, col}, {1_uz, stride}); + } + }; + + std::size_t group_count = Ms.size(); + + if(!(group_count == Ns.size() && group_count == Ks.size() && group_count == StrideAs.size() && + group_count == StrideBs.size() && group_count == StrideCs.size())) + { + throw std::runtime_error("wrong! 
inconsistent M/N/Ks, StrideA/B/Cs size\n"); + } + + std::vector> a_m_k; + std::vector> b_k_n; + std::vector> c_m_n_device_results; + + for(std::size_t i = 0; i < group_count; i++) + { + a_m_k.push_back( + Tensor(f_host_tensor_descriptor(Ms[i], Ks[i], StrideAs[i], ALayout{}))); + b_k_n.push_back( + Tensor(f_host_tensor_descriptor(Ks[i], Ns[i], StrideBs[i], BLayout{}))); + + c_m_n_device_results.push_back( + Tensor(f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{}))); + + std::cout << "group: " << i << " a_m_k[" << i << "]:" << a_m_k[i].mDesc << ", b_k_n[" << i + << "]:" << b_k_n[i].mDesc << ", c_m_n_device_results[" << i + << "]:" << c_m_n_device_results[i].mDesc << std::endl; + + std::size_t num_thread = 1; + switch(init_method) + { + case 0: break; + case 1: + a_m_k[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + b_k_n[i].GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + a_m_k[i].GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}, num_thread); + b_k_n[i].GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}, num_thread); + } + + c_m_n_device_results[i].GenerateTensorValue(GeneratorTensor_0{}, num_thread); + } + + using AElementOp = ck::tensor_operation::element_wise::PassThrough; + using BElementOp = ck::tensor_operation::element_wise::PassThrough; + using CElementOp = ck::tensor_operation::element_wise::PassThrough; + + const auto a_element_op = AElementOp{}; + const auto b_element_op = BElementOp{}; + const auto c_element_op = CElementOp{}; + + // if(do_verification) + // { + + // } + + using DeviceMemPtr = std::unique_ptr; + std::vector a_device_buf, b_device_buf, c_device_buf; + + a_device_buf.reserve(group_count); + b_device_buf.reserve(group_count); + c_device_buf.reserve(group_count); + + std::vector p_a, p_b; + std::vector p_c; + + p_a.reserve(group_count); + p_b.reserve(group_count); + p_c.reserve(group_count); + + std::vector gemm_descs; + + gemm_descs.reserve(group_count); + + for(std::size_t i = 0; i < group_count; i++) + { + a_device_buf.emplace_back( + std::make_unique(sizeof(ADataType) * a_m_k[i].mDesc.GetElementSpaceSize())); + b_device_buf.emplace_back( + std::make_unique(sizeof(BDataType) * b_k_n[i].mDesc.GetElementSpaceSize())); + + c_device_buf.emplace_back(std::make_unique( + sizeof(CDataType) * c_m_n_device_results[i].mDesc.GetElementSpaceSize())); + + a_device_buf[i]->ToDevice(a_m_k[i].mData.data()); + b_device_buf[i]->ToDevice(b_k_n[i].mData.data()); + c_device_buf[i]->ToDevice(c_m_n_device_results[i].mData.data()); + + gemm_descs.push_back({Ms[i], Ns[i], Ks[i], StrideAs[i], StrideBs[i], StrideCs[i], {}}); + + p_a.push_back(a_device_buf[i]->GetDeviceBuffer()); + p_b.push_back(b_device_buf[i]->GetDeviceBuffer()); + p_c.push_back(c_device_buf[i]->GetDeviceBuffer()); + } + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemm, + CLayout, + ADataType, + BDataType, + ck::Tuple<>, + CDataType, + AElementOp, + BElementOp, + CElementOp>; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + if(op_ptrs.size() <= 0) + { + throw std::runtime_error("wrong! 
no device GEMM instance found"); + } + + std::string best_gemm_name; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; + + auto p_ds = std::vector>{}; + + // profile device GEMM instances + for(auto& gemm_ptr : op_ptrs) + { + auto argument_ptr = + gemm_ptr->MakeArgumentPointer(p_a, + p_b, + p_ds, + p_c, + gemm_descs, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}, + ck::tensor_operation::element_wise::PassThrough{}); + + auto invoker_ptr = gemm_ptr->MakeInvokerPointer(); + + DeviceMem gemm_desc_workspace(gemm_ptr->GetWorkSpaceSize(argument_ptr.get())); + + gemm_ptr->SetWorkSpacePointer(argument_ptr.get(), gemm_desc_workspace.GetDeviceBuffer()); + + if(gemm_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::string gemm_name = gemm_ptr->GetTypeString(); + + float ave_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t flop = 0, num_btype = 0; + for(std::size_t i = 0; i < gemm_descs.size(); i++) + { + flop += std::size_t(2) * Ms[i] * Ns[i] * Ks[i]; + + num_btype += sizeof(ADataType) * Ms[i] * Ks[i] + sizeof(BDataType) * Ks[i] * Ns[i] + + sizeof(CDataType) * Ms[i] * Ns[i]; + } + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << gemm_name << std::endl; + + if(tflops > best_tflops) + { + best_gemm_name = gemm_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + for(std::size_t i = 0; i < gemm_descs.size(); i++) + { + + c_device_buf[i]->FromDevice(c_m_n_device_results[i].mData.data()); + + Tensor c_m_n_host_result( + f_host_tensor_descriptor(Ms[i], Ns[i], StrideCs[i], CLayout{})); + + using ReferenceGemmInstance = + ck::tensor_operation::host::ReferenceGemm; + + auto ref_gemm = ReferenceGemmInstance{}; + auto ref_invoker = ref_gemm.MakeInvoker(); + + auto ref_argument = ref_gemm.MakeArgument(a_m_k[i], + b_k_n[i], + c_m_n_host_result, + a_element_op, + b_element_op, + c_element_op); + + ref_invoker.Run(ref_argument); + pass = pass && ck::utils::check_err(c_m_n_device_results[i], c_m_n_host_result); + + if(do_log) + { + LogRangeAsType(std::cout << "a : ", a_m_k[i].mData, ",") + << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n[i].mData, ",") << std::endl; + LogRangeAsType( + std::cout << "c_device: ", c_m_n_device_results[i].mData, ",") + << std::endl; + LogRangeAsType( + std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; + } + } + } + } + else + { + std::cout << "does not support this GEMM problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " + << best_gb_per_sec << " GB/s, " << best_gemm_name << std::endl; + + return pass; +} // namespace profiler + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_groupnorm_impl.hpp b/profiler/include/profiler/profile_groupnorm_impl.hpp new file mode 100644 index 00000000..81fec559 --- /dev/null +++ b/profiler/include/profiler/profile_groupnorm_impl.hpp @@ -0,0 +1,208 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
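// A standalone sketch, not part of the CK sources: it illustrates the gamma/beta
// broadcast that the groupnorm profiler in this header requests through its
// gammaBetaStride vector. The input x is treated as a 5-D (N, H, W, G, C) tensor while
// gamma/beta only have shape (G, C); passing strides {0, 0, 0, C, 1} makes the N/H/W
// coordinates irrelevant to the computed offset. All names and shape values below are
// local to this example.
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    const std::size_t G = 3, C = 8;
    std::vector<float> gamma(G * C, 1.0f); // one scale per (g, c) pair

    const std::size_t stride[5] = {0, 0, 0, C, 1}; // zero stride == broadcast axis

    auto offset = [&](std::size_t n, std::size_t h, std::size_t w, std::size_t g, std::size_t c) {
        return n * stride[0] + h * stride[1] + w * stride[2] + g * stride[3] + c * stride[4];
    };

    // Any (n, h, w) lands on the same gamma element for a fixed (g, c).
    std::printf("offset(0,0,0,2,5) = %zu, offset(1,3,3,2,5) = %zu, gamma = %f\n",
                offset(0, 0, 0, 2, 5), offset(1, 3, 3, 2, 5), gamma[offset(1, 3, 3, 2, 5)]);
    return 0;
}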
+ +#pragma once + +#include + +#include "ck/ck.hpp" + +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_groupnorm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_groupnorm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + if(length.size() != 5) + return false; + + index_t G = length[3]; + index_t C = length[4]; + + std::vector reduce_dim = {1, 2, 4}; + std::vector gammaBetaLength = {G, C}; + std::vector gammaBetaStride = {0, 0, 0, C, 1}; + + Tensor x(length); + Tensor gamma(gammaBetaLength); + Tensor beta(gammaBetaLength); + Tensor y(length); + Tensor host_y(length); + + switch(init_method) + { + case 0: + x.GenerateTensorValue(GeneratorTensor_1{}); + gamma.GenerateTensorValue(GeneratorTensor_1{}); + beta.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + x.GenerateTensorValue(GeneratorTensor_3{0, 1}); + gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = ck::tensor_operation::host::ReferenceGroupnorm; + + ReferenceInstance ref; + auto ref_argument = ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, 1e-6); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer( + length, + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}, + gammaBetaStride, + gammaBetaStride, + std::vector{y.mDesc.GetStrides().begin(), y.mDesc.GetStrides().end()}, + reduce_dim, + 1e-6, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + continue; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = 
x.mDesc.GetElementSize() * sizeof(XDataType) + + gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + + beta.mDesc.GetElementSize() * sizeof(BetaDataType) + + y.mDesc.GetElementSize() * sizeof(YDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + y_dev.FromDevice(y.mData.data()); + + bool pass = ck::utils::check_err(y, host_y, "Error: Incorrect results", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "x : ", x.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; + LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_avg_time << " ms, " + << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_layernorm_impl.hpp b/profiler/include/profiler/profile_layernorm_impl.hpp new file mode 100644 index 00000000..eb21d4a5 --- /dev/null +++ b/profiler/include/profiler/profile_layernorm_impl.hpp @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
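// A standalone sketch, not part of the CK sources: the unit bookkeeping behind the
// "Perf: ... ms, ... GB/s" lines printed by the profiler implementations in this patch
// (GB/s for the normalization kernels above and below, TFLOPS additionally for the
// GEMM/conv profilers earlier). Kernel time comes back in milliseconds, so dividing
// byte traffic by 1.E6 yields GB/s and dividing FLOPs by 1.E9 yields TFLOPS. The
// problem size and the 0.42 ms timing are made-up example numbers.
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t M = 1024, N = 1024, K = 64;
    const std::size_t elem_bytes = 2; // e.g. fp16 in and out

    const std::size_t flop      = 2 * M * N * K;                        // one GEMM
    const std::size_t num_bytes = elem_bytes * (M * K + K * N + M * N); // A + B + C traffic

    const float avg_time_ms = 0.42f; // what invoker_ptr->Run(...) would report

    // flop / ms = 1e3 flop / s, so flop / 1e9 / ms = 1e12 flop / s = TFLOPS.
    const float tflops = static_cast<float>(flop) / 1.E9f / avg_time_ms;
    // bytes / ms = 1e3 bytes / s, so bytes / 1e6 / ms = 1e9 bytes / s = GB/s.
    const float gb_per_sec = static_cast<float>(num_bytes) / 1.E6f / avg_time_ms;

    std::printf("%f TFlops, %f GB/s\n", tflops, gb_per_sec);
    return 0;
}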
+ +#pragma once + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/normalization.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp" + +namespace ck { +namespace profiler { + +template +bool profile_layernorm_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector length) +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + if(length.size() < 2) + return false; + + // Assume normalize dimension except for batch (first) dimension + std::vector reduce_length{length.begin() + 1, length.end()}; + std::vector reduce_dim; + for(int i = 1; i < Rank; ++i) + reduce_dim.push_back(i); + + Tensor x(length); + Tensor gamma(reduce_length); + Tensor beta(reduce_length); + Tensor y(length); + Tensor host_y(length); + + std::vector strideXY = + std::vector{x.mDesc.GetStrides().begin(), x.mDesc.GetStrides().end()}; + std::vector strideGammaBeta = strideXY; + strideGammaBeta[0] = 0; + + switch(init_method) + { + case 0: + x.GenerateTensorValue(GeneratorTensor_1{}); + gamma.GenerateTensorValue(GeneratorTensor_1{}); + beta.GenerateTensorValue(GeneratorTensor_1{}); + y.GenerateTensorValue(GeneratorTensor_1{}); + break; + case 1: + x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + gamma.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + beta.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + y.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + x.GenerateTensorValue(GeneratorTensor_3{0, 1}); + gamma.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + beta.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + y.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem x_dev(sizeof(XDataType) * x.mDesc.GetElementSpaceSize()); + DeviceMem gamma_dev(sizeof(GammaDataType) * gamma.mDesc.GetElementSpaceSize()); + DeviceMem beta_dev(sizeof(BetaDataType) * beta.mDesc.GetElementSpaceSize()); + DeviceMem y_dev(sizeof(YDataType) * y.mDesc.GetElementSpaceSize()); + + x_dev.ToDevice(x.mData.data()); + gamma_dev.ToDevice(gamma.mData.data()); + beta_dev.ToDevice(beta.mData.data()); + + constexpr int NumReduceDim = Rank - 1; + + // add device normalization instances + using DeviceOp = ck::tensor_operation::device::DeviceNormalization; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + if(do_verification) + { + using ReferenceInstance = ck::tensor_operation::host::ReferenceLayernorm; + + ReferenceInstance ref; + auto ref_argument = + ref.MakeArgument(x, gamma, beta, host_y, PassThrough{}, length, reduce_dim, 1e-4); + auto ref_invoker = ref.MakeInvoker(); + ref_invoker.Run(ref_argument); + } + + int num_kernel = 0; + + for(auto& inst_ptr : instance_ptrs) + { + auto argument_ptr = inst_ptr->MakeArgumentPointer(length, + strideXY, + strideGammaBeta, + strideGammaBeta, + strideXY, + reduce_dim, + 1e-4, + x_dev.GetDeviceBuffer(), + gamma_dev.GetDeviceBuffer(), + beta_dev.GetDeviceBuffer(), + y_dev.GetDeviceBuffer(), + nullptr, + nullptr, + PassThrough{}); + + 
if(inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + ++num_kernel; + } + else + { + if(time_kernel) + { + std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; + LogRange(std::cout << "input lengths = ", length, ", ") << std::endl; + } + + continue; + } + + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = x.mDesc.GetElementSize() * sizeof(XDataType) + + gamma.mDesc.GetElementSize() * sizeof(GammaDataType) + + beta.mDesc.GetElementSize() * sizeof(BetaDataType) + + y.mDesc.GetElementSize() * sizeof(YDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + y_dev.FromDevice(y.mData.data()); + + bool pass = ck::utils::check_err( + y.mData, host_y.mData, "Error: Incorrect results d1", 1e-3, 1e-3); + + if(do_log) + { + LogRangeAsType(std::cout << "x : ", x.mData, ",") << std::endl; + LogRangeAsType(std::cout << "host_y : ", host_y.mData, ",") << std::endl; + LogRangeAsType(std::cout << "y : ", y.mData, ",") << std::endl; + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "lengths = [", length, ", ") << "]." << std::endl; + return false; + } + else + { + if(time_kernel) + std::cout << "pass" << std::endl; + } + } + } + + if(time_kernel) + { + LogRange(std::cout << "length = ", length, ",") << ", "; + LogRange(std::cout << "stride = ", strideXY, ",") << ", "; + LogRange(std::cout << "reduce dims ", reduce_dim, ",") << std::endl; + std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_instance_name << std::endl; + } + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; + } + + return true; +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp new file mode 100644 index 00000000..ccb99398 --- /dev/null +++ b/profiler/include/profiler/profile_reduce_impl.hpp @@ -0,0 +1,520 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
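// A standalone sketch, not part of the CK sources: it mirrors the bit-mask trick used
// by the get_invariant_dims helper defined below in this header to split tensor
// dimensions into reduced and invariant ones. Rank and reduceDims here are arbitrary
// example values.
#include <cstdio>
#include <vector>

int main()
{
    constexpr int Rank = 4;
    const std::vector<int> reduceDims{1, 3};

    int reduceFlag = 0;
    for(int d : reduceDims)
        reduceFlag |= 1 << d; // set one bit per reduced dimension

    std::vector<int> invariantDims;
    for(int i = 0; i < Rank; ++i)
        if((reduceFlag & (1 << i)) == 0) // bit not set -> dimension is kept
            invariantDims.push_back(i);

    for(int d : invariantDims)
        std::printf("invariant dim: %d\n", d); // prints 0 and 2
    return 0;
}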
+ +#pragma once + +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_reduction.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +struct ReduceDescription +{ + static constexpr index_t Rank_ = Rank; + static constexpr index_t NumReduceDim_ = NumReduceDim; + static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId; + static constexpr bool PropagateNan_ = PropagateNan; + static constexpr bool UseIndex_ = UseIndex; +}; + +using reduce_description_instances = + std::tuple, // for ADD + ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG + ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2 + ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN + ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX + ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX + ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>, + + ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN + ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>, + ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX + ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>, + ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX + ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>; + +template +bool description_match(const DescriptionType& description, + int Rank, + const std::vector& reduceDims, + ReduceTensorOp ReduceOpId, + bool PropagateNan, + bool UseIndex) +{ + if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId || + description.PropagateNan_ != PropagateNan || 
description.UseIndex_ != UseIndex) + return (false); + + if(DescriptionType::NumReduceDim_ != reduceDims.size()) + return (false); + + bool result = true; + + return (result); +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck + +namespace ck { +namespace profiler { + +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) +{ + int reduceFlag = 0; + + // flag the bits for the reduceDims + for(int i = 0; i < NumReduceDim; i++) + { + reduceFlag |= 1 << reduceDims[i]; + }; + + std::array invariantDims; + + // collect invariant dimensions + int dim = 0; + for(int i = 0; i < Rank; i++) + if((reduceFlag & (1 << i)) == 0) + { + invariantDims[dim] = i; + dim++; + }; + + return invariantDims; +}; + +template +bool profile_reduce_impl_impl(bool do_verification, + int init_method, + bool do_dumpout, + bool time_kernel, + const std::vector& inLengths, + const std::array& reduceDims, + float alpha, + float beta) +{ + using namespace ck::tensor_operation::device; + using namespace ck::tensor_operation::device::instance; + using ck::host_common::dumpBufferToFile; + + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; + + constexpr bool op_support_indices = + (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || + ReduceOpId == ReduceTensorOp::AMAX); + + constexpr bool OutputIndex = (op_support_indices && UseIndex); + + constexpr bool out_support_atomic_add = std::is_same::value; + constexpr bool op_support_atomic_add = + !op_support_indices && ReduceOpId != ReduceTensorOp::NORM2; + constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); + + // 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations + // 2) If InDataType is half_t, must use float as AccDataType for non-indexable reduction + // operations + constexpr bool invalid_reduce_1 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InDataType is float, must use float as AccDataType for indexable reduction operations + constexpr bool invalid_reduce_2 = + std::is_same::value && + (op_support_indices && !std::is_same::value); + + // 1) The indices can only be used when the reduction operation is indexable + constexpr bool invalid_reduce_3 = (!op_support_indices && UseIndex); + + // 1) If InDataType is int8_t, must use int8_t as AccDataType for indexable reduction operations + // 2) If InDataType is int8_t, must use int32_t as AccDataType for non-indexable reduction + // operations + constexpr bool invalid_reduce_4 = + std::is_same::value && + ((!op_support_indices && !std::is_same::value) || + (op_support_indices && !std::is_same::value)); + + // 1) If InDataType is int8_t, the supported operation must be either indexable operations or + // ADD/AVG + constexpr bool invalid_reduce_5 = std::is_same::value && + (!op_support_indices && ReduceOpId != ReduceTensorOp::ADD && + ReduceOpId != ReduceTensorOp::AVG); + + // 1) If InDataType is bhalf_t, must use float as AccDataType for all reduction operations + constexpr bool invalid_reduce_6 = + std::is_same::value && !std::is_same::value; + + constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || + invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); + + bool pass = true; + + if constexpr(!invalid_reduce) + { + Tensor in(inLengths); + + std::vector outLengths; + + const auto 
invariantDims = get_invariant_dims(reduceDims); + + if(reduceDims.size() == Rank) + outLengths.push_back(1); + else + for(auto dim : invariantDims) + outLengths.push_back(inLengths[dim]); + + Tensor out_ref(outLengths); + Tensor out(outLengths); + Tensor out_indices_ref(outLengths); + Tensor out_indices(outLengths); + + auto inStrides = in.mDesc.GetStrides(); + auto outStrides = out.mDesc.GetStrides(); + + size_t invariant_total_length = out.mDesc.GetElementSize(); + size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; + + std::size_t num_thread = 1; + + if(do_verification) + { + switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_1{1}, num_thread); + break; + case 2: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, num_thread); + if(beta != 0.0f) + out_ref.GenerateTensorValue(GeneratorTensor_3{-5.0, 5.0}, + num_thread); + } + + if(beta != 0.0f) + for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) + out.mData[i] = out_ref.mData[i]; + }; + + // these buffers are usually provided by the user application + DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); + + in_dev.ToDevice(in.mData.data()); + + if(beta != 0.0f) + out_dev.ToDevice(out.mData.data()); + + size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int) : 0; + + DeviceMem out_indices_dev(indicesSizeInBytes); + + float best_avg_time = 0; + float best_gb_per_sec = 0; + + using InElementwiseOperation = + typename reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename reduce_unary_operator::AccElementwiseOperation; + + using ReduceOperation = typename reduce_binary_operator::opType; + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + + std::tie(in_elementwise_op, acc_elementwise_op) = + reduce_unary_operator::GetElementwiseOperator( + static_cast(reduce_total_length)); + + using DeviceReduceInstPtr = + DeviceReducePtr; + + std::vector reduce_ptrs; + + add_device_reduce_instance_threadwise(reduce_ptrs); + + add_device_reduce_instance_blockwise(reduce_ptrs); + + if constexpr(use_atomic_add) + { + add_device_reduce_instance_multiblock_atomic_add(reduce_ptrs); + } + + if(reduce_ptrs.empty()) + { + throw std::runtime_error("Wrong! 
No device REDUCE instance found"); + }; + + if(do_verification) + { + ReductionHost + hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); + + hostReduce.Run(alpha, + in.mData.data(), + beta, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); + }; + + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; + + ck::ranges::copy(inLengths, arrInLengths.begin()); + ck::ranges::copy(inStrides, arrInStrides.begin()); + ck::ranges::copy(outLengths, arrOutLengths.begin()); + ck::ranges::copy(outStrides, arrOutStrides.begin()); + + for(auto& reduce_ptr : reduce_ptrs) + { + auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in_dev.GetDeviceBuffer(), + nullptr, + out_dev.GetDeviceBuffer(), + out_indices_dev.GetDeviceBuffer(), + in_elementwise_op, + acc_elementwise_op); + + if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) + continue; + + std::string reduce_name = reduce_ptr->GetTypeString(); + + auto invoker_ptr = reduce_ptr->MakeInvokerPointer(); + + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + std::size_t num_bytes = + invariant_total_length * reduce_total_length * sizeof(InDataType) + + invariant_total_length * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + if(time_kernel) + std::cout << "Perf: " << avg_time << " ms, " << gb_per_sec << " GB/s, " + << reduce_name << std::endl; + + if(gb_per_sec > best_gb_per_sec) + { + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + + if(do_verification) + { + bool single_pass; + + out_dev.FromDevice(out.mData.data()); + single_pass = ck::utils::check_err(out, out_ref); + + if(OutputIndex) + { + out_indices_dev.FromDevice(out_indices.mData.data()); + single_pass = single_pass && ck::utils::check_err(out_indices, out_indices_ref); + }; + + if(!single_pass) + { + std::cout << "Fail Info: " << reduce_ptr->GetTypeString() << std::endl; + } + + pass = pass && single_pass; + }; + + if(do_dumpout) + { + dumpBufferToFile("dump_in.bin", in.mData.data(), in.mDesc.GetElementSize()); + dumpBufferToFile("dump_out.bin", out.mData.data(), out.mDesc.GetElementSize()); + dumpBufferToFile( + "dump_out_host.bin", out_ref.mData.data(), out_ref.mDesc.GetElementSize()); + if(OutputIndex) + { + dumpBufferToFile("dump_indices.bin", + out_indices.mData.data(), + out_indices.mDesc.GetElementSize()); + dumpBufferToFile("dump_indices_host.bin", + out_indices_ref.mData.data(), + out_indices_ref.mDesc.GetElementSize()); + }; + }; + }; + + if(time_kernel) + std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" + << std::endl; + } + else + { + std::cout << "The requested reduction operation is not supported, please check !!!" 
+ << std::endl; + }; + + return pass; +}; + +template +bool profile_reduce_impl(bool do_verification, + int init_method, + bool do_dumpout, + bool time_kernel, + const std::vector& inLengths, + const std::vector& reduceDims, + ReduceTensorOp ReduceOpId, + bool PropagateNan, + bool UseIndex, + float alpha, + float beta) +{ + bool matched = false; + bool pass = true; + + using tuple_of_description_instances = + tensor_operation::device::instance::reduce_description_instances; + + const auto tuple_object = tuple_of_description_instances{}; + + static_for<0, std::tuple_size::value, 1>{}([&](auto i) { + if(matched) + return; + + using descType = remove_cvref_t(tuple_object))>; + + if(!description_match( + descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex)) + return; + + std::array arrReduceDims; + + ck::ranges::copy(reduceDims, arrReduceDims.begin()); + + pass = pass && profile_reduce_impl_impl(descType::ReduceOpId_), + descType::PropagateNan_, + descType::UseIndex_>(do_verification, + init_method, + do_dumpout, + time_kernel, + inLengths, + arrReduceDims, + alpha, + beta); + + matched = true; + }); + + return pass; +}; + +} // namespace profiler +} // namespace ck diff --git a/profiler/include/profiler/profile_softmax_impl.hpp b/profiler/include/profiler/profile_softmax_impl.hpp new file mode 100644 index 00000000..090cdaaa --- /dev/null +++ b/profiler/include/profiler/profile_softmax_impl.hpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" + +namespace ck { +namespace profiler { + +enum struct SoftmaxDataType +{ + F32_F32, // in, out + F16_F16, + BF16_BF16, + INT8_INT8, +}; + +// clang-format off +template std::string type_to_string(); +template <> std::string type_to_string() { return "f32"; } +template <> std::string type_to_string() { return "f16"; } +template <> std::string type_to_string() { return "bf16"; } +template <> std::string type_to_string() { return "int8"; } +template <> std::string type_to_string() { return "int32"; } +// clang-format on + +template +bool profile_softmax_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + std::vector in_length, + std::vector in_strides, + std::vector reduce_dims, + AccDataType alpha, + AccDataType beta) +{ + if(Rank != in_length.size()) + { + throw std::runtime_error("Input tensor rank is different from template argument Rank!"); + } + + Tensor in = in_strides.empty() ? 
Tensor(in_length) + : Tensor(in_length, in_strides); + Tensor out(in.mDesc); + Tensor prior_out(in.mDesc); + + switch(init_method) + { + case 0: break; + case 1: + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(in.begin(), in.end()); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(prior_out.begin(), + prior_out.end()); + break; + default: + ck::utils::FillUniformDistribution{0.0f, 1.0f}(in); + ck::utils::FillUniformDistribution{-0.5f, 0.5f}(prior_out); + } + + Tensor out_ref(prior_out); + + if(do_verification) + { + using ReferenceSoftmax = + tensor_operation::host::ReferenceSoftmax; + ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims}); + } + + DeviceMem in_dev(in.GetElementSpaceSizeInBytes()); + DeviceMem out_dev(out.GetElementSpaceSizeInBytes()); + in_dev.ToDevice(in.data()); + + std::vector in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end()); + std::vector in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end()); + + // add device softmax instances + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using DeviceOp = tensor_operation::device:: + DeviceSoftmax; + + // get device op instances + const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + std::cout << "found " << instances.size() << " instances" << std::endl; + + if(instances.size() <= 0) + { + throw std::runtime_error("wrong! no device normalization instance found"); + } + + std::string best_instance_name; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + std::vector instance_pass; + + for(auto& inst_ptr : instances) + { + // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3 + // problem to rank 4 kernel) other than invoking IsSupportedArgument()? + if(!(inst_ptr->GetNumReduceDim() == static_cast(reduce_dims.size()))) + { + continue; + } + + auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths, + in_tensor_strides, + reduce_dims, + &alpha, + &beta, + in_dev.GetDeviceBuffer(), + out_dev.GetDeviceBuffer(), + PassThrough{}, + PassThrough{}); + + if(!inst_ptr->IsSupportedArgument(argument_ptr.get())) + { + std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; + LogRange(std::cout << "input lengths = [", in_length, ", ") + << "], " + << "scaler = [" << alpha << ", " << beta << "]"; + LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl; + instance_pass.push_back(true); + continue; + } + + out_dev.ToDevice(prior_out.data()); + auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + { + std::size_t num_bytes = + in.GetElementSize() * sizeof(InDataType) + + (beta == 0.0f ? 
1 : 2) * out.GetElementSize() * sizeof(OutDataType); + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + } + + if(do_verification) + { + out_dev.FromDevice(out.data()); + bool pass = true; + if(std::is_same::value) + { + pass = pass && ck::utils::check_err( + out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_ref : ", out_ref.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "out : ", out.mData, ",") << std::endl; + } + } + else + { + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); + if(do_log) + { + LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; + LogRangeAsType(std::cout << "out_ref : ", out_ref.mData, ",") + << std::endl; + LogRangeAsType(std::cout << "out : ", out.mData, ",") << std::endl; + } + } + + if(!pass) + { + std::cout << inst_ptr->GetTypeString() << " failed verification: "; + LogRange(std::cout << "input lengths = [", in_length, ", ") + << "], " + << "scaler = [" << alpha << ", " << beta << "]." << std::endl; + } + instance_pass.push_back(pass); + } + } + if(time_kernel) + { + std::cout << "Best Perf for datatype = " << type_to_string() << "_" + << type_to_string() << ", "; + LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", "; + LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", "; + LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", "; + std::cout << "alpha = " << alpha << ", " + << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec + << " GB/s, " << best_instance_name << std::endl; + } + return std::all_of( + std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; }); +} + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt new file mode 100644 index 00000000..bc87554c --- /dev/null +++ b/profiler/src/CMakeLists.txt @@ -0,0 +1,67 @@ +# ckProfiler +set(PROFILER_SOURCES + profiler.cpp + profile_gemm.cpp + profile_gemm_splitk.cpp + profile_gemm_bilinear.cpp + profile_gemm_bias_add_reduce.cpp + profile_gemm_add_add_fastgelu.cpp + profile_gemm_add_fastgelu.cpp + profile_gemm_fastgelu.cpp + profile_gemm_reduce.cpp + profile_batched_gemm.cpp + profile_batched_gemm_gemm.cpp + profile_batched_gemm_add_relu_gemm_add.cpp + profile_batched_gemm_reduce.cpp + profile_grouped_gemm.cpp + profile_conv_fwd.cpp + profile_conv_fwd_bias_relu.cpp + profile_conv_fwd_bias_relu_add.cpp + profile_conv_bwd_data.cpp + profile_grouped_conv_fwd.cpp + profile_grouped_conv_bwd_weight.cpp + profile_reduce.cpp + profile_groupnorm.cpp + profile_layernorm.cpp + profile_softmax.cpp + profile_batchnorm_fwd.cpp + profile_batchnorm_bwd.cpp +) + +set(PROFILER_EXECUTABLE ckProfiler) + +add_executable(${PROFILER_EXECUTABLE} ${PROFILER_SOURCES}) +target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors) + +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE 
device_gemm_bilinear_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance) + +rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) diff --git a/profiler/src/profile_batched_gemm.cpp b/profiler/src/profile_batched_gemm.cpp new file mode 100644 index 00000000..907a3737 --- /dev/null +++ b/profiler/src/profile_batched_gemm.cpp @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
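// A standalone sketch, not part of the CK sources: how the "-1 means use a packed
// default" stride convention in this profiler entry resolves. A row-major M x K matrix
// has leading stride K (column-major: M), and one packed batch then spans
// (row-major ? M : K) * stride elements. Sizes and the a_row_major flag below are
// illustrative stand-ins for the layout dispatch done further down in this file.
#include <cstdio>

int main()
{
    const int M = 256, K = 64;

    const bool a_row_major = true; // stands in for ALayout == RowMajor
    const int  StrideA     = -1;   // -1 on the command line requests the default

    const int StrideA_ = (StrideA < 0) ? (a_row_major ? K : M) : StrideA;
    // A packed batch holds M*K elements: M rows of pitch K (row-major) or
    // K columns of pitch M (column-major), i.e. (row-major ? M : K) * StrideA_.
    const int BatchStrideA_ = (a_row_major ? M : K) * StrideA_;

    std::printf("StrideA_ = %d, BatchStrideA_ = %d\n", StrideA_, BatchStrideA_);
    return 0;
}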
+ +#include +#include +#include +#include +#include + +#include "profiler/profile_batched_gemm_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct GemmMatrixLayout +{ + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 +}; + +enum struct GemmDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +#define OP_NAME "batched_gemm" +#define OP_DESC "Batched GEMM" + +int profile_batched_gemm(int argc, char* argv[]) +{ + if(argc != 18) + { + // clang-format off + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n"); + printf("arg3: matrix layout (0: A[g, m, k] * B[g, k, n] = C[g, m, n];\n"); + printf(" 1: A[g, m, k] * B[g, n, k] = C[g, m, n];\n"); + printf(" 2: A[g, k, m] * B[g, k, n] = C[g, m, n];\n"); + printf(" 3: A[g, k, m] * B[g, n, k] = C[g, m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); + printf("arg8 to 17: M, N, K, StrideA, StrideB, StrideC, BatchStrideA, BatchStrideB, BatchStrideC, BatchCount\n"); + // clang-format on + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + + const int BatchStrideA = std::stoi(argv[14]); + const int BatchStrideB = std::stoi(argv[15]); + const int BatchStrideC = std::stoi(argv[16]); + + const int BatchCount = std::stoi(argv[17]); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using INT8 = int8_t; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto c_type, + auto a_layout, + auto b_layout, + auto c_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using CDataType = decltype(c_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using CLayout = decltype(c_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideC = ck::is_same_v ? N : M; + + const int StrideA_ = (StrideA < 0) ? DefaultStrideA : StrideA; + const int StrideB_ = (StrideB < 0) ? DefaultStrideB : StrideB; + const int StrideC_ = (StrideC < 0) ? DefaultStrideC : StrideC; + + const int DefaultBatchStrideA = (ck::is_same_v ? M : K) * StrideA_; + const int DefaultBatchStrideB = (ck::is_same_v ? K : N) * StrideB_; + const int DefaultBatchStrideC = (ck::is_same_v ? M : N) * StrideC_; + + const int BatchStrideA_ = (BatchStrideA < 0) ? DefaultBatchStrideA : BatchStrideA; + const int BatchStrideB_ = (BatchStrideB < 0) ? DefaultBatchStrideB : BatchStrideB; + const int BatchStrideC_ = (BatchStrideC < 0) ? 
DefaultBatchStrideC : BatchStrideC; + + bool pass = ck::profiler:: + profile_batched_gemm_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + BatchStrideA_, + BatchStrideB_, + BatchStrideC_, + StrideA_, + StrideB_, + StrideC_, + BatchCount); + + return pass ? 0 : 1; + }; + + if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F32{}, F32{}, F32{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F32{}, F32{}, F32{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(F32{}, F32{}, F32{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(F32{}, F32{}, F32{}, Col{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(BF16{}, BF16{}, BF16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(BF16{}, BF16{}, BF16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(BF16{}, BF16{}, BF16{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(BF16{}, BF16{}, BF16{}, Col{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(INT8{}, INT8{}, INT8{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(INT8{}, INT8{}, INT8{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(INT8{}, INT8{}, INT8{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(INT8{}, INT8{}, INT8{}, Col{}, Col{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm); diff --git a/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp new file mode 100644 index 00000000..f440a309 --- /dev/null +++ b/profiler/src/profile_batched_gemm_add_relu_gemm_add.cpp @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
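// A tiny self-contained host reference, not the CK reference kernel, for the fused
// operation this profiler entry exercises with layout option 0:
// E1[m, o] = Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o].
// Shapes and fill values are chosen so the result can be checked by hand.
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    const int M = 2, N = 3, K = 4, O = 2;
    std::vector<float> A0(M * K, 1.f), B0(N * K, 1.f), D0(M * N, -3.f);
    std::vector<float> B1(N * O, 1.f), D1(M * O, 0.5f), E1(M * O, 0.f);

    for(int m = 0; m < M; ++m)
        for(int o = 0; o < O; ++o)
        {
            float acc1 = D1[m * O + o];
            for(int n = 0; n < N; ++n)
            {
                // First GEMM: A0 is M x K row-major, B0 is N x K (K-contiguous), so
                // C0[m, n] = sum_k A0[m, k] * B0[n, k].
                float acc0 = D0[m * N + n];
                for(int k = 0; k < K; ++k)
                    acc0 += A0[m * K + k] * B0[n * K + k];
                acc1 += std::max(acc0, 0.f) * B1[n * O + o]; // Relu, then second GEMM
            }
            E1[m * O + o] = acc1;
        }

    for(float v : E1)
        std::printf("%g ", v); // every element should be 0.5 + 3 * Relu(4 - 3) = 3.5
    std::printf("\n");
    return 0;
}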
+ +#include +#include +#include +#include + +#include "profiler/profile_batched_gemm_add_relu_gemm_add_impl.hpp" +#include "profiler_operation_registry.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +#define OP_NAME "batched_gemm_add_relu_gemm_add" +#define OP_DESC "Batched GEMM+Add+Relu+GEMM+Add" + +int profile_batched_gemm_add_relu_gemm_add(int argc, char* argv[]) +{ + enum struct GemmMatrixLayout + { + MK_NK_MN_NO_MO_MO, // 0 + MK_NK_MN_ON_MO_MO, // 1 + }; + + enum struct GemmDataType + { + F32_F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F16_F16_F16, // 1 + }; + + GemmDataType data_type = GemmDataType::F16_F16_F16_F16_F16_F16; + GemmMatrixLayout layout = GemmMatrixLayout::MK_NK_MN_NO_MO_MO; + bool do_verification = true; + int init_method = 1; + bool do_log = 0; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 64; + ck::index_t O = 128; + ck::index_t BatchCount = 4; + ck::index_t StrideA0 = -1; + ck::index_t StrideB0 = -1; + ck::index_t StrideD0 = -1; + ck::index_t StrideB1 = -1; + ck::index_t StrideD1 = -1; + ck::index_t StrideE1 = -1; + ck::index_t BatchStrideA0 = -1; + ck::index_t BatchStrideB0 = -1; + ck::index_t BatchStrideD0 = -1; + ck::index_t BatchStrideB1 = -1; + ck::index_t BatchStrideD1 = -1; + ck::index_t BatchStrideE1 = -1; + + if(argc == 8) + { + data_type = static_cast(std::stoi(argv[2])); + layout = static_cast(std::stoi(argv[3])); + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + time_kernel = std::stoi(argv[7]); + } + else if(argc == 13) + { + data_type = static_cast(std::stoi(argv[2])); + layout = static_cast(std::stoi(argv[3])); + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + time_kernel = std::stoi(argv[7]); + + M = std::stoi(argv[8]); + N = std::stoi(argv[9]); + K = std::stoi(argv[10]); + O = std::stoi(argv[11]); + BatchCount = std::stoi(argv[12]); + } + else if(argc == 25) + { + data_type = static_cast(std::stoi(argv[2])); + layout = static_cast(std::stoi(argv[3])); + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + time_kernel = std::stoi(argv[7]); + + M = std::stoi(argv[8]); + N = std::stoi(argv[9]); + K = std::stoi(argv[10]); + O = std::stoi(argv[11]); + BatchCount = std::stoi(argv[12]); + + StrideA0 = std::stoi(argv[13]); + StrideB0 = std::stoi(argv[14]); + StrideD0 = std::stoi(argv[15]); + StrideB1 = std::stoi(argv[16]); + StrideD1 = std::stoi(argv[17]); + StrideE1 = std::stoi(argv[18]); + + BatchStrideA0 = std::stoi(argv[19]); + BatchStrideB0 = std::stoi(argv[20]); + BatchStrideD0 = std::stoi(argv[21]); + BatchStrideB1 = std::stoi(argv[22]); + BatchStrideD1 = std::stoi(argv[23]); + BatchStrideE1 = std::stoi(argv[24]); + } + else + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (1: fp16)\n"); + printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] " + "= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = " + "E1[m, o];)\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 12: M, N, K, O, Batch\n"); + 
printf("arg13 to 18: StrideA0, StrideB0, StrideD0, StrideB1, StrideD1, StrideE1\n"); + printf("arg19 to 24: BatchStrideA0, BatchStrideB0, BatchStrideD0, BatchStrideB1, " + "BatchStrideD1, BatchStrideE1 \n"); + exit(1); + } + + if(data_type == GemmDataType::F16_F16_F16_F16_F16_F16 && + layout == GemmMatrixLayout::MK_NK_MN_NO_MO_MO) + { + ck::profiler::profile_batched_gemm_add_relu_gemm_add_impl, // D0sLayout, + Row, // B1Layout, + ck::Tuple, // D1sLayout, + Row, // E1Layout, + F16, // A0DataType, + F16, // B0DataType, + ck::Tuple, // D0DataType, + F16, // B1DataType, + ck::Tuple, // D1sDataType + F16> // E1DataType, + (do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + O, + BatchCount, + StrideA0, + StrideB0, + StrideD0, + StrideB1, + StrideD1, + StrideE1, + BatchStrideA0, + BatchStrideB0, + BatchStrideD0, + BatchStrideB1, + BatchStrideD1, + BatchStrideE1); + } + else if(data_type == GemmDataType::F16_F16_F16_F16_F16_F16 && + layout == GemmMatrixLayout::MK_NK_MN_ON_MO_MO) + { + ck::profiler::profile_batched_gemm_add_relu_gemm_add_impl, // D0sLayout, + Col, // B1Layout, + ck::Tuple, // D1sLayout, + Row, // E1Layout, + F16, // A0DataType, + F16, // B0DataType, + ck::Tuple, // D0DataType, + F16, // B1DataType, + ck::Tuple, // D1sDataType + F16> // E1DataType, + (do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + O, + BatchCount, + StrideA0, + StrideB0, + StrideD0, + StrideB1, + StrideD1, + StrideE1, + BatchStrideA0, + BatchStrideB0, + BatchStrideD0, + BatchStrideB1, + BatchStrideD1, + BatchStrideE1); + } + else + { + throw std::runtime_error("wrong! this data_type & layout is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_add_relu_gemm_add); diff --git a/profiler/src/profile_batched_gemm_gemm.cpp b/profiler/src/profile_batched_gemm_gemm.cpp new file mode 100644 index 00000000..6015c93b --- /dev/null +++ b/profiler/src/profile_batched_gemm_gemm.cpp @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include + +#include "profiler/profile_batched_gemm_gemm_impl.hpp" +#include "profiler_operation_registry.hpp" + +using F16 = ck::half_t; +using F32 = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +#define OP_NAME "batched_gemm_gemm" +#define OP_DESC "Batched GEMM+GEMM" + +int profile_batched_gemm_gemm(int argc, char* argv[]) +{ + enum struct GemmMatrixLayout + { + MK_NK_NO_MO, // 0 + MK_NK_ON_MO, // 0 + }; + + enum struct GemmDataType + { + F32_F32_F32_F32, // 0 + F16_F16_F16_F16, // 1 + }; + + GemmDataType data_type = GemmDataType::F16_F16_F16_F16; + GemmMatrixLayout layout = GemmMatrixLayout::MK_NK_NO_MO; + bool do_verification = true; + int init_method = 1; + bool do_log = 0; + bool time_kernel = false; + + // GEMM shape + ck::index_t M = 1024; + ck::index_t N = 1024; + ck::index_t K = 64; + ck::index_t O = 128; + ck::index_t BatchCount = 4; + ck::index_t StrideA0 = -1; + ck::index_t StrideB0 = -1; + ck::index_t StrideB1 = -1; + ck::index_t StrideE1 = -1; + ck::index_t BatchStrideA0 = -1; + ck::index_t BatchStrideB0 = -1; + ck::index_t BatchStrideB1 = -1; + ck::index_t BatchStrideE1 = -1; + + if(argc == 8) + { + data_type = static_cast(std::stoi(argv[2])); + layout = static_cast(std::stoi(argv[3])); + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + time_kernel = std::stoi(argv[7]); + } + else if(argc == 13) + { + data_type = static_cast(std::stoi(argv[2])); + layout = static_cast(std::stoi(argv[3])); + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + time_kernel = std::stoi(argv[7]); + + M = std::stoi(argv[8]); + N = std::stoi(argv[9]); + K = std::stoi(argv[10]); + O = std::stoi(argv[11]); + BatchCount = std::stoi(argv[12]); + } + else if(argc == 21) + { + data_type = static_cast(std::stoi(argv[2])); + layout = static_cast(std::stoi(argv[3])); + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + time_kernel = std::stoi(argv[7]); + + M = std::stoi(argv[8]); + N = std::stoi(argv[9]); + K = std::stoi(argv[10]); + O = std::stoi(argv[11]); + BatchCount = std::stoi(argv[12]); + + StrideA0 = std::stoi(argv[13]); + StrideB0 = std::stoi(argv[14]); + StrideB1 = std::stoi(argv[15]); + StrideE1 = std::stoi(argv[16]); + + BatchStrideA0 = std::stoi(argv[17]); + BatchStrideB0 = std::stoi(argv[18]); + BatchStrideB1 = std::stoi(argv[19]); + BatchStrideE1 = std::stoi(argv[20]); + } + else + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (1: fp16)\n"); + printf("arg3: matrix layout (0: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[n, o] + D1[m, o] " + "= E1[m, o]; 1: Relu(A0[m, k] * B0[n, k] + D0[m, n]) * B1[o, n] + D1[m, o] = E1[m, " + "o];)\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 12: M, N, K, O, Batch\n"); + printf("arg13 to 16: StrideA0, StrideB0, StrideB1, StrideE1\n"); + printf("arg17 to 20: BatchStrideA0, BatchStrideB0, BatchStrideB1, BatchStrideE1 \n"); + exit(1); + } + + if(data_type == GemmDataType::F16_F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_NO_MO) + { + ck::profiler::profile_batched_gemm_gemm_impl // E1Layout, + (do_verification, + init_method, + do_log, + 
time_kernel, + M, + N, + K, + O, + BatchCount, + StrideA0, + StrideB0, + StrideB1, + StrideE1, + BatchStrideA0, + BatchStrideB0, + BatchStrideB1, + BatchStrideE1); + } + else if(data_type == GemmDataType::F16_F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_ON_MO) + { + ck::profiler::profile_batched_gemm_gemm_impl // E1Layout, + (do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + O, + BatchCount, + StrideA0, + StrideB0, + StrideB1, + StrideE1, + BatchStrideA0, + BatchStrideB0, + BatchStrideB1, + BatchStrideE1); + } + else + { + throw std::runtime_error("wrong! this data_type & layout is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_gemm); diff --git a/profiler/src/profile_batched_gemm_reduce.cpp b/profiler/src/profile_batched_gemm_reduce.cpp new file mode 100644 index 00000000..6b1dfc01 --- /dev/null +++ b/profiler/src/profile_batched_gemm_reduce.cpp @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_batched_gemm_reduce_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "batched_gemm_reduce" +#define OP_DESC "Batched GEMM+Reduce" + +int profile_batched_gemm_reduce(int argc, char* argv[]) +{ + enum struct GemmMatrixLayout + { + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 + }; + + enum struct GemmReduceDataType + { + F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F32_F32, // 1 + }; + + if(argc != 15) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); + printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); + printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); + printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); + printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, BatchCount\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + + const int BatchCount = std::stoi(argv[14]); + + if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) + { + ck::profiler::profile_batched_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? K : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC, + BatchCount); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::MK_NK_MN) + { + ck::profiler::profile_batched_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? K : StrideA, + (StrideB < 0) ? K : StrideB, + (StrideC < 0) ? 
N : StrideC, + BatchCount); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::KM_KN_MN) + { + ck::profiler::profile_batched_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC, + BatchCount); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::KM_NK_MN) + { + ck::profiler::profile_batched_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? K : StrideB, + (StrideC < 0) ? N : StrideC, + BatchCount); + } + else + { + throw std::runtime_error("wrong! this data_type & layout is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_batched_gemm_reduce); diff --git a/profiler/src/profile_batchnorm_bwd.cpp b/profiler/src/profile_batchnorm_bwd.cpp new file mode 100644 index 00000000..44ce7350 --- /dev/null +++ b/profiler/src/profile_batchnorm_bwd.cpp @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "ck/library/utility/host_common_util.hpp" +#include "profiler/profile_batchnorm_backward_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +using namespace std; + +static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"reduceDims", required_argument, nullptr, 'R'}, + {"dumpout", required_argument, nullptr, 'o'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchnormBwdArgParser +{ + private: + int option_index = 0; + + public: + std::vector inLengths; + std::vector reduceDims; + + bool do_verification = false; + bool do_dumpout = false; + + bool haveSavedMeanInvVar; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + + BatchnormBwdArgParser() = default; + ~BatchnormBwdArgParser() = default; + + void show_usage(const char* cmd) + { + // clang-format off + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl; + std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2 -- 1/0 to indicate whether to use saved mean and invVariance" << std::endl; + std::cout << "Arg3 -- init method used for dy and bnScale (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl; + std::cout << "Arg4 -- time kernel (0=no, 1=yes)" << std::endl; + // clang-format on + }; + + int operator()(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + optind++; // to skip the module name + + while(1) + { + ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw 
std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case 'o': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_dumpout = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return -1; + }; + break; + + default: + show_usage(argv[0]); + std::cerr << "Invalid cmd-line options!" << std::endl; + return -1; + }; + }; + + if(optind + 4 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + haveSavedMeanInvVar = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind++])); + + if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6) + return -1; + + return 0; + }; +}; // end of class AppArgs + +static const double epsilon = std::numeric_limits::epsilon(); + +int profile_batchnorm_backward(int argc, char* argv[]) +{ + using ck::profiler::profile_batchnorm_backward_impl; + + BatchnormBwdArgParser arg_parser; + + if(arg_parser(argc, argv) != 0) + return -1; + + using F16 = ck::half_t; + using F32 = float; + using BF16 = ck::bhalf_t; + using F64 = double; + + if(arg_parser.data_type == 0) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_backward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.haveSavedMeanInvVar, + epsilon); + }; + } + else if(arg_parser.data_type == 1) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_backward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.haveSavedMeanInvVar, + epsilon); + }; + } + else if(arg_parser.data_type == 5) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_backward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.haveSavedMeanInvVar, + epsilon); + }; + } + else if(arg_parser.data_type == 6) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_backward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.haveSavedMeanInvVar, + epsilon); + }; + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("bnorm_bwd", "Batchnorm backward", profile_batchnorm_backward); diff --git a/profiler/src/profile_batchnorm_fwd.cpp b/profiler/src/profile_batchnorm_fwd.cpp new file mode 100644 index 00000000..902a1fc4 --- /dev/null +++ b/profiler/src/profile_batchnorm_fwd.cpp @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include + +#include "ck/library/utility/host_common_util.hpp" +#include "profiler/profile_batchnorm_forward_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +using namespace std; + +static const struct option long_options[] = {{"inOutLengths", required_argument, nullptr, 'D'}, + {"reduceDims", required_argument, nullptr, 'R'}, + {"dumpout", required_argument, nullptr, 'o'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +class BatchnormFwdArgParser +{ + private: + int option_index = 0; + + public: + std::vector inLengths; + std::vector reduceDims; + + bool do_verification = false; + bool do_dumpout = false; + + bool updateMovingAverage; + bool saveMeanAndInvVariance; + + int data_type = 0; + int init_method = 2; + bool time_kernel = false; + + BatchnormFwdArgParser() = default; + ~BatchnormFwdArgParser() = default; + + void show_usage(const char* cmd) + { + // clang-format off + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inOutLengths or -D, comma separated list of input tensor dimension lengths, must have 4 integers for nhwc" << std::endl; + std::cout << "--reduceDims or -R, comma separated list of dimensions to reduce on" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the result by comparing with the host-based batch-normalization" << std::endl; + std::cout << "Arg1: data type (0: fp16, 1: fp32, 5: bp16, 6: fp64)" << std::endl; + std::cout << "Arg2: 1/0 to indicate whether to update the moving average and variance (0=no, 1=yes)" << std::endl; + std::cout << "Arg3: 1/0 to indicate whether to save the calculated mean and invVariance (0=no, 1=yes)" << std::endl; + std::cout << "Arg4: init method used for bnScale and bnBias (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)" << std::endl; + std::cout << "Arg5: time kernel (0=no, 1=yes)" << std::endl; + // clang-format on + }; + + int operator()(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + optind++; // to skip the module name + + while(1) + { + ch = getopt_long(argc, argv, "D:R:v:o:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case 'o': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_dumpout = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return -1; + }; + break; + + default: + show_usage(argv[0]); + std::cerr << "Invalid cmd-line options!" 
<< std::endl; + return -1; + }; + }; + + if(optind + 5 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + data_type = std::atoi(argv[optind++]); + updateMovingAverage = std::atoi(argv[optind++]); + saveMeanAndInvVariance = std::atoi(argv[optind++]); + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind++])); + + if(data_type != 0 && data_type != 1 && data_type != 3 && data_type != 5 && data_type != 6) + return -1; + + return 0; + }; +}; // end of class AppArgs + +static const double epsilon = std::numeric_limits::epsilon(); +static const double averageFactor = 0.1; + +int profile_batchnorm_forward(int argc, char* argv[]) +{ + using ck::profiler::profile_batchnorm_forward_impl; + + BatchnormFwdArgParser arg_parser; + + if(arg_parser(argc, argv) != 0) + return -1; + + using F16 = ck::half_t; + using F32 = float; + using BF16 = ck::bhalf_t; + using F64 = double; + + if(arg_parser.data_type == 0) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_forward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.updateMovingAverage, + arg_parser.saveMeanAndInvVariance, + epsilon, + averageFactor); + }; + } + else if(arg_parser.data_type == 1) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_forward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.updateMovingAverage, + arg_parser.saveMeanAndInvVariance, + epsilon, + averageFactor); + }; + } + else if(arg_parser.data_type == 5) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_forward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.updateMovingAverage, + arg_parser.saveMeanAndInvVariance, + epsilon, + averageFactor); + }; + } + else if(arg_parser.data_type == 6) + { + if(arg_parser.inLengths.size() == 4 && arg_parser.reduceDims.size() == 3) + { + profile_batchnorm_forward_impl( + arg_parser.do_verification, + arg_parser.init_method, + arg_parser.do_dumpout, + arg_parser.time_kernel, + arg_parser.inLengths, + arg_parser.reduceDims, + arg_parser.updateMovingAverage, + arg_parser.saveMeanAndInvVariance, + epsilon, + averageFactor); + }; + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("bnorm_fwd", "Batchnorm forward", profile_batchnorm_forward); diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp new file mode 100644 index 00000000..9241ead7 --- /dev/null +++ b/profiler/src/profile_conv_bwd_data.cpp @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
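For orientation, a minimal host-side sketch of the NHWC batchnorm forward pass that the profiler above times and, optionally, verifies: a per-channel mean and inverse variance over N*H*W, followed by the affine normalization. The running-mean/variance update and the saving of mean/invVariance are omitted, and the function name is illustrative rather than library code.

#include <cmath>
#include <vector>

void reference_batchnorm_fwd_nhwc(const std::vector<float>& x,     // reduceSize x C, NHWC flattened
                                  const std::vector<float>& scale, // C
                                  const std::vector<float>& bias,  // C
                                  std::vector<float>& y,           // reduceSize x C
                                  int reduceSize,                  // N * H * W
                                  int C,
                                  double epsilon)
{
    for(int c = 0; c < C; ++c)
    {
        double mean = 0.0, meansq = 0.0;
        for(int i = 0; i < reduceSize; ++i)
        {
            const double v = x[i * C + c];
            mean += v;
            meansq += v * v;
        }
        mean /= reduceSize;
        meansq /= reduceSize;
        const double invVar = 1.0 / std::sqrt(meansq - mean * mean + epsilon);
        for(int i = 0; i < reduceSize; ++i)
            y[i * C + c] =
                static_cast<float>(scale[c] * ((x[i * C + c] - mean) * invVar) + bias[c]);
    }
}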
+ +#include +#include +#include +#include + +#include "profiler/profile_conv_bwd_data_impl.hpp" +#include "profiler_operation_registry.hpp" + +namespace { + +enum struct ConvLayout +{ + NCHW_KCYX_NKHW, // 0 + NHWC_KYXC_NHWK, // 1 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +#define OP_NAME "conv_bwd_data" +#define OP_DESC "Convolution Backward Data" + +static void print_helper_msg() +{ + std::cout + << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8)\n" + << "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n" + << " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, " + "K])\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +} // namespace + +int profile_conv_bwd_data(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial) + { + print_helper_msg(); + return 1; + } + + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using INT8 = int8_t; + + using NWC = ck::tensor_layout::convolution::NWC; + using NHWC = ck::tensor_layout::convolution::NHWC; + using NDHWC = ck::tensor_layout::convolution::NDHWC; + + using KXC = ck::tensor_layout::convolution::KXC; + using KYXC = ck::tensor_layout::convolution::KYXC; + using KZYXC = ck::tensor_layout::convolution::KZYXC; + + using NWK = ck::tensor_layout::convolution::NWK; + using NHWK = ck::tensor_layout::convolution::NHWK; + using NDHWK = ck::tensor_layout::convolution::NDHWK; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + bool pass = ck::profiler::profile_conv_bwd_data_impl( + do_verification, init_method, do_log, time_kernel, params); + + return pass ? 
0 : 1; + }; + + if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{}); + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_bwd_data); diff --git a/profiler/src/profile_conv_fwd.cpp b/profiler/src/profile_conv_fwd.cpp new file mode 100644 index 00000000..b57ee7fd --- /dev/null +++ b/profiler/src/profile_conv_fwd.cpp @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
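As a worked example of the argument-count check used here and in conv_fwd below: for a 2-D convolution the profiler expects argc == 8 + 1 + 4 + 6 * 2 == 25, i.e. argv[0], the op name, six control flags, num_dim_spatial, the four G/N/K/C sizes, and six groups of per-spatial-dimension parameters whose exact meaning is printed by ck::utils::conv::get_conv_param_parser_helper_msg().

// Compile-time restatement of the 2-D case, purely for illustration.
static_assert(8 + 1 + 4 + 6 * 2 == 25, "argument count expected for a 2-D convolution");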
+ +#include +#include +#include +#include + +#include "profiler/profile_conv_fwd_impl.hpp" +#include "profiler_operation_registry.hpp" + +namespace { + +enum struct ConvLayout +{ + NCHW_KCYX_NKHW, // 0 + NHWC_KYXC_NHWK, // 1 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +#define OP_NAME "conv_fwd" +#define OP_DESC "Convolution Forward" + +static void print_helper_msg() +{ + std::cout + // clang-format-off + << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8)\n" + << "arg3: tensor layout (0: Input[N, C, Hi, Wi], Weight[K, C, Y, X], Output[N, K, Ho, Wo]\n" + << " 1: Input[N, Hi, Wi, C], Weight[K, Y, X, C], Output[N, Ho, Wo, " + "K])\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; + // clang-format-on +} + +} // namespace + +int profile_conv_fwd(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial) + { + print_helper_msg(); + return 1; + } + + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using INT8 = int8_t; + + using NWC = ck::tensor_layout::convolution::NWC; + using NHWC = ck::tensor_layout::convolution::NHWC; + using NDHWC = ck::tensor_layout::convolution::NDHWC; + + using KXC = ck::tensor_layout::convolution::KXC; + using KYXC = ck::tensor_layout::convolution::KYXC; + using KZYXC = ck::tensor_layout::convolution::KZYXC; + + using NWK = ck::tensor_layout::convolution::NWK; + using NHWK = ck::tensor_layout::convolution::NHWK; + using NDHWK = ck::tensor_layout::convolution::NDHWK; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + bool pass = ck::profiler::profile_conv_fwd_impl( + do_verification, init_method, do_log, time_kernel, params); + + return pass ? 
0 : 1; + }; + + if(num_dim_spatial == 1 && layout == ConvLayout::NHWC_KYXC_NHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I1, NWC{}, KXC{}, NWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I1, NWC{}, KXC{}, NWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I1, NWC{}, KXC{}, NWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I1, NWC{}, KXC{}, NWK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NHWC_KYXC_NHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I2, NHWC{}, KYXC{}, NHWK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::NHWC_KYXC_NHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I3, NDHWC{}, KZYXC{}, NDHWK{}, INT8{}, INT8{}, INT8{}); + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd); diff --git a/profiler/src/profile_conv_fwd_bias_relu.cpp b/profiler/src/profile_conv_fwd_bias_relu.cpp new file mode 100644 index 00000000..b44007cd --- /dev/null +++ b/profiler/src/profile_conv_fwd_bias_relu.cpp @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
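The fused conv_fwd_bias_relu profilers that follow derive the output spatial size from the padded input and the dilated filter footprint. A compile-time check of that formula with illustrative numbers (the classic 7x7, stride-2, pad-3 stem on a 224 input):

// YEff = (Y - 1) * Dy + 1 = 7;  Ho = (Hi + LeftPy + RightPy - YEff) / Sy + 1 = 111 + 1 = 112
constexpr int Hi = 224, Y = 7, Dy = 1, Sy = 2, LeftPy = 3, RightPy = 3;
constexpr int YEff = (Y - 1) * Dy + 1;
constexpr int Ho   = (Hi + LeftPy + RightPy - YEff) / Sy + 1;
static_assert(Ho == 112, "a 7x7, stride-2, pad-3 convolution maps a 224 input to 112");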
+ +#include +#include +#include +#include + +#include "profiler/profile_conv_fwd_bias_relu_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 +}; + +enum struct ConvInputLayout +{ + NCHW, // 0 + NHWC, // 1 +}; + +enum struct ConvWeightLayout +{ + KCYX, // 0 + KYXC, // 1 +}; + +enum struct ConvOutputLayout +{ + NKHW, // 0 + NHWK, // 1 +}; + +#define OP_NAME "conv_fwd_bias_relu" +#define OP_DESC "Convolution Forward+Bias+ReLU" + +int profile_conv_fwd_bias_relu(int argc, char* argv[]) +{ + if(argc != 25) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n"); + printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n"); + printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n"); + printf("arg6: verification (0: no; 1: yes)\n"); + printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg8: print tensor value (0: no; 1: yes)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); + printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto in_layout = static_cast(std::stoi(argv[3])); + const auto wei_layout = static_cast(std::stoi(argv[4])); + const auto out_layout = static_cast(std::stoi(argv[5])); + const bool do_verification = std::stoi(argv[6]); + const int init_method = std::stoi(argv[7]); + const bool do_log = std::stoi(argv[8]); + const bool time_kernel = std::stoi(argv[9]); + + const ck::index_t N = std::stoi(argv[10]); + const ck::index_t K = std::stoi(argv[11]); + const ck::index_t C = std::stoi(argv[12]); + const ck::index_t Y = std::stoi(argv[13]); + const ck::index_t X = std::stoi(argv[14]); + const ck::index_t Hi = std::stoi(argv[15]); + const ck::index_t Wi = std::stoi(argv[16]); + + const ck::index_t conv_stride_h = std::stoi(argv[17]); + const ck::index_t conv_stride_w = std::stoi(argv[18]); + const ck::index_t conv_dilation_h = std::stoi(argv[19]); + const ck::index_t conv_dilation_w = std::stoi(argv[20]); + const ck::index_t in_left_pad_h = std::stoi(argv[21]); + const ck::index_t in_left_pad_w = std::stoi(argv[22]); + const ck::index_t in_right_pad_h = std::stoi(argv[23]); + const ck::index_t in_right_pad_w = std::stoi(argv[24]); + + const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; + const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; + + const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + + if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC && + wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) + { + ck::profiler::profile_conv_fwd_bias_relu_impl<2, + ck::half_t, + ck::half_t, + ck::half_t, + ck::tensor_layout::convolution::NHWC, + ck::tensor_layout::convolution::KYXC, + ck::tensor_layout::convolution::NHWK>( + do_verification, + init_method, + do_log, + time_kernel, + N, + K, + C, + std::vector{Hi, Wi}, + std::vector{Y, X}, + std::vector{Ho, Wo}, + std::vector{conv_stride_h, conv_stride_w}, + std::vector{conv_dilation_h, conv_dilation_w}, + std::vector{in_left_pad_h, in_left_pad_w}, + std::vector{in_right_pad_h, in_right_pad_w}); + } + else + { + throw std::runtime_error("wrong! 
data_type & layout for this operator is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu); diff --git a/profiler/src/profile_conv_fwd_bias_relu_add.cpp b/profiler/src/profile_conv_fwd_bias_relu_add.cpp new file mode 100644 index 00000000..408dd02f --- /dev/null +++ b/profiler/src/profile_conv_fwd_bias_relu_add.cpp @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_conv_fwd_bias_relu_add_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 +}; + +enum struct ConvInputLayout +{ + NCHW, // 0 + NHWC, // 1 +}; + +enum struct ConvWeightLayout +{ + KCYX, // 0 + KYXC, // 1 +}; + +enum struct ConvOutputLayout +{ + NKHW, // 0 + NHWK, // 1 +}; + +#define OP_NAME "conv_fwd_bias_relu_add" +#define OP_DESC "Convolution Forward+Bias+ReLU+Add" + +int profile_conv_fwd_bias_relu_add(int argc, char* argv[]) +{ + if(argc != 25) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n"); + printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n"); + printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n"); + printf("arg6: verification (0: no; 1: yes)\n"); + printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg8: print tensor value (0: no; 1: yes)\n"); + printf("arg9: time kernel (0=n0, 1=yes)\n"); + printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " + "RightPx\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto in_layout = static_cast(std::stoi(argv[3])); + const auto wei_layout = static_cast(std::stoi(argv[4])); + const auto out_layout = static_cast(std::stoi(argv[5])); + const bool do_verification = std::stoi(argv[6]); + const int init_method = std::stoi(argv[7]); + const bool do_log = std::stoi(argv[8]); + const bool time_kernel = std::stoi(argv[9]); + + const ck::index_t N = std::stoi(argv[10]); + const ck::index_t K = std::stoi(argv[11]); + const ck::index_t C = std::stoi(argv[12]); + const ck::index_t Y = std::stoi(argv[13]); + const ck::index_t X = std::stoi(argv[14]); + const ck::index_t Hi = std::stoi(argv[15]); + const ck::index_t Wi = std::stoi(argv[16]); + + const ck::index_t conv_stride_h = std::stoi(argv[17]); + const ck::index_t conv_stride_w = std::stoi(argv[18]); + const ck::index_t conv_dilation_h = std::stoi(argv[19]); + const ck::index_t conv_dilation_w = std::stoi(argv[20]); + const ck::index_t in_left_pad_h = std::stoi(argv[21]); + const ck::index_t in_left_pad_w = std::stoi(argv[22]); + const ck::index_t in_right_pad_h = std::stoi(argv[23]); + const ck::index_t in_right_pad_w = std::stoi(argv[24]); + + const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; + const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; + + const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; + const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + + if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC && + wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) + { + ck::profiler::profile_conv_fwd_bias_relu_add_impl<2, + ck::half_t, + ck::half_t, + ck::half_t, + 
ck::tensor_layout::convolution::NHWC, + ck::tensor_layout::convolution::KYXC, + ck::tensor_layout::convolution::NHWK>( + do_verification, + init_method, + do_log, + time_kernel, + N, + K, + C, + std::vector{Hi, Wi}, + std::vector{Y, X}, + std::vector{Ho, Wo}, + std::vector{conv_stride_h, conv_stride_w}, + std::vector{conv_dilation_h, conv_dilation_w}, + std::vector{in_left_pad_h, in_left_pad_w}, + std::vector{in_right_pad_h, in_right_pad_w}); + } + else + { + throw std::runtime_error("wrong! data_type & layout for this operator is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_fwd_bias_relu_add); diff --git a/profiler/src/profile_gemm.cpp b/profiler/src/profile_gemm.cpp new file mode 100644 index 00000000..61bae6ae --- /dev/null +++ b/profiler/src/profile_gemm.cpp @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_gemm_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct GemmMatrixLayout +{ + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 +}; + +enum struct GemmDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +#define OP_NAME "gemm" +#define OP_DESC "GEMM" + +static void print_helper_msg() +{ + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" + << "arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n" + << " 1: A[m, k] * B[n, k] = C[m, n];\n" + << " 2: A[k, m] * B[k, n] = C[m, n];\n" + << " 3: A[k, m] * B[n, k] = C[m, n])\n" + << "arg4: verification (0: no; 1: yes)\n" + << "arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << "arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n" + << std::endl; +} + +int profile_gemm(int argc, char* argv[]) +{ + if(argc != 14) + { + print_helper_msg(); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using INT8 = int8_t; + using INT32 = int32_t; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_layout, + auto b_layout, + auto c_layout, + auto a_type, + auto b_type, + auto acc_type, + auto c_type) { + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using CLayout = decltype(c_layout); + + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using AccDataType = decltype(acc_type); + using CDataType = decltype(c_type); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideC = ck::is_same_v ? 
N : M; + + bool pass = + ck::profiler::profile_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideC < 0) ? DefaultStrideC : StrideC); + + return pass ? 0 : 1; + }; + + if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(Row{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(Col{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(Row{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(Col{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(Row{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(Row{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(Col{}, Row{}, Row{}, BF16{}, BF16{}, F32{}, BF16{}); + } + else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(Col{}, Col{}, Row{}, BF16{}, BF16{}, F32{}, BF16{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(Row{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(Row{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(Col{}, Row{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{}); + } + else if(data_type == GemmDataType::INT8_INT8_INT8 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(Col{}, Col{}, Row{}, INT8{}, INT8{}, INT32{}, INT8{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm); diff --git a/profiler/src/profile_gemm_add_add_fastgelu.cpp b/profiler/src/profile_gemm_add_add_fastgelu.cpp new file mode 100644 index 00000000..c3c0fb7b --- /dev/null +++ b/profiler/src/profile_gemm_add_add_fastgelu.cpp @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
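The default strides computed in the lambda above follow a convention shared by the GEMM-family profilers: a negative stride on the command line selects the packed leading dimension of the chosen layout. A minimal restatement of that rule (the helper name is illustrative, not library code):

// The packed leading dimension is the extent of the fastest-varying index:
// row-major A[M, K] -> K, row-major B[K, N] and C[M, N] -> N; the column-major
// counterparts give M and K respectively.
inline int packed_leading_dimension(bool is_row_major, int num_rows, int num_cols)
{
    return is_row_major ? num_cols : num_rows;
}
// e.g. the effective StrideA above is
// (StrideA < 0) ? packed_leading_dimension(is_row_major_A, M, K) : StrideA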
+ +#include +#include +#include +#include + +#include "profiler/profile_gemm_add_add_fastgelu_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "gemm_add_add_fastgelu" +#define OP_DESC "GEMM+Add+Add+FastGeLU" + +int profile_gemm_add_add_fastgelu(int argc, char* argv[]) +{ + enum struct MatrixLayout + { + MK_KN_MN_MN_MN, // 0 + MK_NK_MN_MN_MN, // 1 + KM_KN_MN_MN_MN, // 2 + KM_NK_MN_MN_MN, // 3 + }; + + enum struct MatrixDataType + { + F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F16_F16, // 1 + BF16_BF16_BF16_BF16_BF16, // 2 + INT8_INT8_INT8_INT8_INT8, // 3 + }; + + if(argc != 16) + { + // clang-format off + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"); + printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n"); + printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n"); + printf(" 2: E[m, n] = FastGeLU(A[k, m] * B[k, n] + D0[m, n] + D1[m, n]);\n"); + printf(" 3: E[m, n] = FastGeLU(A[k, m] * B[n, k] + D0[m, n] + D1[m, n]))\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 15: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n"); + // clang-format on + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideD0 = std::stoi(argv[13]); + const int StrideD1 = std::stoi(argv[14]); + const int StrideE = std::stoi(argv[15]); + + using F16 = ck::half_t; + using F32 = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto acc_type, + auto d0_type, + auto d1_type, + auto e_type, + auto a_layout, + auto b_layout, + auto d0_layout, + auto d1_layout, + auto e_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using AccDataType = decltype(acc_type); + using D0DataType = decltype(d0_type); + using D1DataType = decltype(d1_type); + using EDataType = decltype(e_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using D0Layout = decltype(d0_layout); + using D1Layout = decltype(d1_layout); + using ELayout = decltype(e_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideD0 = ck::is_same_v ? N : M; + const int DefaultStrideD1 = ck::is_same_v ? N : M; + const int DefaultStrideE = ck::is_same_v ? N : M; + + bool pass = ck::profiler::profile_gemm_add_add_fastgelu_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideD0 < 0) ? DefaultStrideD0 : StrideD0, + (StrideD1 < 0) ? DefaultStrideD1 : StrideD1, + (StrideE < 0) ? DefaultStrideE : StrideE); + + return pass ? 
0 : 1; + }; + + if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && + layout == MatrixLayout::MK_NK_MN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && + layout == MatrixLayout::KM_KN_MN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && + layout == MatrixLayout::KM_NK_MN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_add_fastgelu); diff --git a/profiler/src/profile_gemm_add_fastgelu.cpp b/profiler/src/profile_gemm_add_fastgelu.cpp new file mode 100644 index 00000000..380b25a6 --- /dev/null +++ b/profiler/src/profile_gemm_add_fastgelu.cpp @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_gemm_add_fastgelu_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "gemm_add_fastgelu" +#define OP_DESC "GEMM+Add+FastGeLU" + +int profile_gemm_add_fastgelu(int argc, char* argv[]) +{ + enum struct MatrixLayout + { + MK_KN_MN_MN, // 0 + MK_NK_MN_MN, // 1 + KM_KN_MN_MN, // 2 + KM_NK_MN_MN, // 3 + }; + + enum struct MatrixDataType + { + F32_F32_F32_F32, // 0 + F16_F16_F16_F16, // 1 + BF16_BF16_BF16_BF16, // 2 + INT8_INT8_INT8_INT8, // 3 + }; + + if(argc != 15) + { + // clang-format off + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"); + printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n]);\n"); + printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n]);\n"); + printf(" 2: E[m, n] = FastGeLU(A[k, m] * B[k, n] + D0[m, n]);\n"); + printf(" 3: E[m, n] = FastGeLU(A[k, m] * B[n, k] + D0[m, n]))\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideD0, StrideE\n"); + // clang-format on + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideD0 = std::stoi(argv[13]); + const int StrideE = std::stoi(argv[14]); + + using F16 = ck::half_t; + using F32 = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto acc_type, + auto d0_type, + auto 
e_type, + auto a_layout, + auto b_layout, + auto d0_layout, + auto e_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using AccDataType = decltype(acc_type); + using D0DataType = decltype(d0_type); + using EDataType = decltype(e_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using D0Layout = decltype(d0_layout); + using ELayout = decltype(e_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideD0 = ck::is_same_v ? N : M; + const int DefaultStrideE = ck::is_same_v ? N : M; + + bool pass = ck::profiler::profile_gemm_add_fastgelu_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideD0 < 0) ? DefaultStrideD0 : StrideD0, + (StrideE < 0) ? DefaultStrideE : StrideE); + + return pass ? 0 : 1; + }; + + if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_NK_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::KM_KN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::KM_NK_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_add_fastgelu); diff --git a/profiler/src/profile_gemm_bias_add_reduce.cpp b/profiler/src/profile_gemm_bias_add_reduce.cpp new file mode 100644 index 00000000..6d86db08 --- /dev/null +++ b/profiler/src/profile_gemm_bias_add_reduce.cpp @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
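The gemm_add_add_fastgelu and gemm_add_fastgelu profilers above, together with gemm_fastgelu further below, differ only in how many auxiliary D tensors are added to the accumulator before the activation. The activation itself is the usual tanh approximation of GELU, sketched here for reference; the library's device-side formulation may differ in the exact polynomial and rounding, so treat this as illustrative only.

#include <cmath>

// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
inline float fast_gelu_reference(float x)
{
    const float k = 0.7978845608f; // sqrt(2 / pi)
    return 0.5f * x * (1.0f + std::tanh(k * (x + 0.044715f * x * x * x)));
}
// gemm_fastgelu:          e = fast_gelu_reference(acc)
// gemm_add_fastgelu:      e = fast_gelu_reference(acc + d0)
// gemm_add_add_fastgelu:  e = fast_gelu_reference(acc + d0 + d1)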
+ +#include +#include +#include +#include + +#include "profiler/profile_gemm_bias_add_reduce_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "gemm_bias_add_reduce" +#define OP_DESC "GEMM+Bias+Add+Reduce" + +int profile_gemm_bias_add_reduce(int argc, char* argv[]) +{ + enum struct GemmMatrixLayout + { + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 + }; + + enum struct GemmReduceDataType + { + F32_F32_F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F16_F16_F32_F32, // 1 + }; + + if(!(argc == 14 || argc == 15)) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); + printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); + printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); + printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); + printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideC, StrideC1\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + const int StrideC1 = std::stoi(argv[14]); + + if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::MK_KN_MN) + { + ck::profiler::profile_gemm_bias_add_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC, + (StrideC1 < 0) ? N : StrideC1); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::MK_NK_MN) + { + ck::profiler::profile_gemm_bias_add_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC, + (StrideC1 < 0) ? N : StrideC1); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::KM_KN_MN) + { + ck::profiler::profile_gemm_bias_add_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC, + (StrideC1 < 0) ? N : StrideC1); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::KM_NK_MN) + { + ck::profiler::profile_gemm_bias_add_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC, + (StrideC1 < 0) ? N : StrideC1); + } + else + { + throw std::runtime_error("wrong! 
this data_type & layout is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bias_add_reduce); diff --git a/profiler/src/profile_gemm_bilinear.cpp b/profiler/src/profile_gemm_bilinear.cpp new file mode 100644 index 00000000..3480014b --- /dev/null +++ b/profiler/src/profile_gemm_bilinear.cpp @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_gemm_bilinear_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "gemm_bilinear" +#define OP_DESC "GEMM+Bilinear" + +int profile_gemm_bilinear(int argc, char* argv[]) +{ + enum struct MatrixLayout + { + MK_KN_MN_MN, // 0 + MK_NK_MN_MN, // 1 + KM_KN_MN_MN, // 2 + KM_NK_MN_MN, // 3 + }; + + enum struct MatrixDataType + { + F32_F32_F32_F32, // 0 + F16_F16_F16_F16, // 1 + BF16_BF16_BF16_BF16, // 2 + INT8_INT8_INT8_INT8, // 3 + }; + + if(argc != 17) + { + // clang-format off + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"); + printf("arg3: matrix layout (0: E[m, n] = alpha * A[m, k] * B[k, n] + beta * D[m, n];\n"); + printf(" 1: E[m, n] = alpha * A[m, k] * B[n, k] + beta * D[m, n];\n"); + printf(" 2: E[m, n] = alpha * A[k, m] * B[k, n] + beta * D[m, n];\n"); + printf(" 3: E[m, n] = alpha * A[k, m] * B[n, k] + beta * D[m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 14: M, N, K, StrideA, StrideB, StrideD, StrideE\n"); + printf("arg15 to 16: alhpa, beta\n"); + // clang-format on + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideD = std::stoi(argv[13]); + const int StrideE = std::stoi(argv[14]); + + const float alpha = std::stof(argv[15]); + const float beta = std::stof(argv[16]); + + using F16 = ck::half_t; + using F32 = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto acc_type, + auto d_type, + auto e_type, + auto a_layout, + auto b_layout, + auto d_layout, + auto e_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using AccDataType = decltype(acc_type); + using DDataType = decltype(d_type); + using EDataType = decltype(e_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using DLayout = decltype(d_layout); + using ELayout = decltype(e_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideD = ck::is_same_v ? N : M; + const int DefaultStrideE = ck::is_same_v ? 
N : M; + + bool pass = ck::profiler::profile_gemm_bilinear_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideD < 0) ? DefaultStrideD : StrideD, + (StrideE < 0) ? DefaultStrideE : StrideE, + alpha, + beta); + + return pass ? 0 : 1; + }; + + if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::MK_NK_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::KM_KN_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16_F16 && layout == MatrixLayout::KM_NK_MN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_bilinear); diff --git a/profiler/src/profile_gemm_fastgelu.cpp b/profiler/src/profile_gemm_fastgelu.cpp new file mode 100644 index 00000000..2a137224 --- /dev/null +++ b/profiler/src/profile_gemm_fastgelu.cpp @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_gemm_fastgelu_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "gemm_fastgelu" +#define OP_DESC "GEMM+FastGeLU" + +int profile_gemm_fastgelu(int argc, char* argv[]) +{ + enum struct MatrixLayout + { + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 + }; + + enum struct MatrixDataType + { + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 + }; + + if(argc != 14) + { + // clang-format off + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"); + printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n]);\n"); + printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k]);\n"); + printf(" 2: E[m, n] = FastGeLU(A[k, m] * B[k, n]);\n"); + printf(" 3: E[m, n] = FastGeLU(A[k, m] * B[n, k]))\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideE\n"); + // clang-format on + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideE = std::stoi(argv[13]); + + using F16 = ck::half_t; + using F32 = float; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = 
ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto acc_type, + auto e_type, + auto a_layout, + auto b_layout, + auto e_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using AccDataType = decltype(acc_type); + using EDataType = decltype(e_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using ELayout = decltype(e_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideE = ck::is_same_v ? N : M; + + bool pass = ck::profiler::profile_gemm_fastgelu_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideE < 0) ? DefaultStrideE : StrideE); + + return pass ? 0 : 1; + }; + + if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::MK_KN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::MK_NK_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::KM_KN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}); + } + else if(data_type == MatrixDataType::F16_F16_F16 && layout == MatrixLayout::KM_NK_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_fastgelu); diff --git a/profiler/src/profile_gemm_reduce.cpp b/profiler/src/profile_gemm_reduce.cpp new file mode 100644 index 00000000..395bf062 --- /dev/null +++ b/profiler/src/profile_gemm_reduce.cpp @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
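+// Sketch for exposition (hypothetical helper, not used by the code below): every
+// branch in this file substitutes the packed default when a stride argument is
+// negative. The rule is simply the leading dimension of a densely packed matrix:
+namespace packed_stride_sketch {
+constexpr int packed_stride(bool row_major, int rows, int cols)
+{
+    // Row-major: rows of `cols` elements are contiguous, so the row stride is `cols`.
+    // Column-major: the column stride is `rows`.
+    return row_major ? cols : rows;
+}
+// A[m, k] stored row-major defaults to StrideA = K; A[k, m] (column-major A) to M.
+static_assert(packed_stride(true, 3, 5) == 5 && packed_stride(false, 3, 5) == 3, "");
+} // namespace packed_stride_sketch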
+ +#include +#include +#include +#include + +#include "profiler/profile_gemm_reduce_impl.hpp" +#include "profiler_operation_registry.hpp" + +#define OP_NAME "gemm_reduce" +#define OP_DESC "GEMM+Reduce" + +int profile_gemm_reduce(int argc, char* argv[]) +{ + enum struct GemmMatrixLayout + { + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 + }; + + enum struct GemmReduceDataType + { + F32_F32_F32_F32_F32, // 0 + F16_F16_F16_F32_F32, // 1 + }; + + if(!(argc == 14 || argc == 15)) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16)\n"); + printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); + printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); + printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); + printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); + printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); + printf("arg14: split k into mulitiple batch\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + + if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) + { + ck::profiler::profile_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? K : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::MK_NK_MN) + { + ck::profiler::profile_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? K : StrideA, + (StrideB < 0) ? K : StrideB, + (StrideC < 0) ? N : StrideC); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::KM_KN_MN) + { + ck::profiler::profile_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? N : StrideB, + (StrideC < 0) ? N : StrideC); + } + else if(data_type == GemmReduceDataType::F16_F16_F16_F32_F32 && + layout == GemmMatrixLayout::KM_NK_MN) + { + ck::profiler::profile_gemm_reduce_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? M : StrideA, + (StrideB < 0) ? K : StrideB, + (StrideC < 0) ? N : StrideC); + } + else + { + throw std::runtime_error("wrong! this data_type & layout is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_reduce); diff --git a/profiler/src/profile_gemm_splitk.cpp b/profiler/src/profile_gemm_splitk.cpp new file mode 100644 index 00000000..f636ce71 --- /dev/null +++ b/profiler/src/profile_gemm_splitk.cpp @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
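+// Sketch for exposition (hypothetical helper, not used by the code below): "split-K"
+// runs the K loop as KBatch partial products that are accumulated into the same C
+// element; arg14 (KBatch) below chooses how many partitions the device kernel uses.
+// A scalar host-side illustration of the partitioning:
+namespace splitk_sketch {
+inline float splitk_dot(const float* a, const float* b, int K, int KBatch)
+{
+    float c         = 0.f;
+    const int chunk = (K + KBatch - 1) / KBatch; // ceiling division
+    for(int kb = 0; kb < KBatch; ++kb)           // each chunk could run independently
+    {
+        float partial = 0.f;
+        for(int k = kb * chunk; k < K && k < (kb + 1) * chunk; ++k)
+            partial += a[k] * b[k];
+        c += partial; // device kernels typically merge partials with atomics or a
+                      // follow-up reduction; this sketch accumulates serially
+    }
+    return c;
+}
+} // namespace splitk_sketch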
+ +#include +#include +#include +#include + +#include "profiler/profile_gemm_splitk_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct GemmMatrixLayout +{ + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 +}; + +enum struct GemmDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +#define OP_NAME "gemm_splitk" +#define OP_DESC "Split-K GEMM" + +int profile_gemm_splitk(int argc, char* argv[]) +{ + if(argc != 15) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"); + printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); + printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); + printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); + printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=no, 1=yes)\n"); + printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); + printf("arg14: split k into mulitiple batch\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const int M = std::stoi(argv[8]); + const int N = std::stoi(argv[9]); + const int K = std::stoi(argv[10]); + + const int StrideA = std::stoi(argv[11]); + const int StrideB = std::stoi(argv[12]); + const int StrideC = std::stoi(argv[13]); + const int KBatch = std::stoi(argv[14]); + + using F32 = float; + using F16 = ck::half_t; + + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + auto profile = [&](auto a_type, + auto b_type, + auto acc_type, + auto c_type, + auto a_layout, + auto b_layout, + auto c_layout) { + using ADataType = decltype(a_type); + using BDataType = decltype(b_type); + using AccDataType = decltype(acc_type); + using CDataType = decltype(c_type); + + using ALayout = decltype(a_layout); + using BLayout = decltype(b_layout); + using CLayout = decltype(c_layout); + + const int DefaultStrideA = ck::is_same_v ? K : M; + const int DefaultStrideB = ck::is_same_v ? N : K; + const int DefaultStrideC = ck::is_same_v ? N : M; + + bool pass = ck::profiler::profile_gemm_splitk_impl( + do_verification, + init_method, + do_log, + time_kernel, + M, + N, + K, + (StrideA < 0) ? DefaultStrideA : StrideA, + (StrideB < 0) ? DefaultStrideB : StrideB, + (StrideC < 0) ? DefaultStrideC : StrideC, + KBatch); + + return pass ? 
0 : 1; + }; + + if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F32{}, F32{}, F32{}, F32{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(F32{}, F32{}, F32{}, F32{}, Col{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Row{}, Row{}); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) + { + return profile(F16{}, F16{}, F32{}, F16{}, Col{}, Col{}, Row{}); + } + else + { + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; + } +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_gemm_splitk); diff --git a/profiler/src/profile_grouped_conv_bwd_weight.cpp b/profiler/src/profile_grouped_conv_bwd_weight.cpp new file mode 100644 index 00000000..dfd8a099 --- /dev/null +++ b/profiler/src/profile_grouped_conv_bwd_weight.cpp @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
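+// Note on the argument-count check below: the profiler expects 8 control arguments
+// + 1 for num_dim_spatial + 4 for G/N/K/C + 6 per spatial dimension + 1 for SplitK,
+// e.g. a 2-D problem needs 8 + 1 + 4 + 12 + 1 = 26 arguments. A compile-time sketch
+// of that formula (hypothetical helper, not used by the code below):
+namespace argc_sketch {
+constexpr int expected_argc_bwd_weight(int num_dim_spatial)
+{
+    return 8 + 1 + 4 + 6 * num_dim_spatial + 1;
+}
+static_assert(expected_argc_bwd_weight(2) == 26, "2-D grouped conv bwd-weight argc");
+} // namespace argc_sketch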
+ +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_bwd_weight_impl.hpp" +#include "profiler_operation_registry.hpp" + +namespace { + +enum struct ConvLayout +{ + GNCHW_GKCYX_GNKHW, // 0 + GNHWC_GKYXC_GNHWK, // 1 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_F32_BF16, // 2 +}; + +#define OP_NAME "grouped_conv_bwd_weight" +#define OP_DESC "Grouped Convolution Backward Weight" + +static void print_helper_msg() +{ + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight fp32, Output bf16)\n" + << "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, " + "N, K, Ho, Wo]\n" + << " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, " + "N, Ho, Wo, K]\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << " SplitK\n" + << std::endl; +} + +} // namespace + +int profile_grouped_conv_bwd_weight(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial, 1 for split-K + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial + 1) + { + print_helper_msg(); + return 1; + } + + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); + + ck::index_t split_k = std::stoi(argv[8 + 1 + 4 + 6 * num_dim_spatial]); + split_k = std::max(1, split_k); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + + using GNWC = ck::tensor_layout::convolution::GNWC; + using GNHWC = ck::tensor_layout::convolution::GNHWC; + using GNDHWC = ck::tensor_layout::convolution::GNDHWC; + + using GKXC = ck::tensor_layout::convolution::GKXC; + using GKYXC = ck::tensor_layout::convolution::GKYXC; + using GKZYXC = ck::tensor_layout::convolution::GKZYXC; + + using GNWK = ck::tensor_layout::convolution::GNWK; + using GNHWK = ck::tensor_layout::convolution::GNHWK; + using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType = decltype(wei_type); + using OutDataType = decltype(out_type); + + bool pass = ck::profiler::profile_grouped_conv_bwd_weight_impl( + do_verification, init_method, do_log, time_kernel, params, split_k); + + return pass ? 
0 : 1; + }; + + if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_F32_BF16) + { + // fp32 atomic add is used for weight tensor in bf16 kernel + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, F32{}, BF16{}); + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_bwd_weight); diff --git a/profiler/src/profile_grouped_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd.cpp new file mode 100644 index 00000000..9ff3c15a --- /dev/null +++ b/profiler/src/profile_grouped_conv_fwd.cpp @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
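+// Sketch for exposition (standard-library analogue, not used by the code below): the
+// dispatch in this file passes the spatial rank as a compile-time constant
+// (ck::Number<1/2/3>) so the generic profile lambda can recover it as a template
+// parameter. The same pattern with std::integral_constant, for illustration only:
+#include <type_traits>
+namespace ndim_dispatch_sketch {
+template <typename F>
+int dispatch_ndim(int num_dim_spatial, F&& f)
+{
+    switch(num_dim_spatial)
+    {
+    case 1: return f(std::integral_constant<int, 1>{}); // rank known at compile time
+    case 2: return f(std::integral_constant<int, 2>{});
+    case 3: return f(std::integral_constant<int, 3>{});
+    default: return 1; // unsupported rank
+    }
+}
+} // namespace ndim_dispatch_sketch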
+ +#include +#include +#include +#include + +#include "profiler/profile_grouped_conv_fwd_impl.hpp" +#include "profiler_operation_registry.hpp" + +namespace { + +enum struct ConvLayout +{ + GNHWC_GKYXC_GNHWK, // 0 + NHWGC_GKYXC_NHWGK, // 1 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +#define OP_NAME "grouped_conv_fwd" +#define OP_DESC "Grouped Convolution Forward" + +static void print_helper_msg() +{ + std::cout + // clang-format off + << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8)\n" + << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n" + << " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K])\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; + // clang-format on +} + +} // namespace + +int profile_grouped_conv_fwd(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 9) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const int num_dim_spatial = std::stoi(argv[8]); + + // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 8 + 1 + 4 + 6 * num_dim_spatial) + { + print_helper_msg(); + return 1; + } + + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using INT8 = int8_t; + + // + using GNWC = ck::tensor_layout::convolution::GNWC; + using GNHWC = ck::tensor_layout::convolution::GNHWC; + using GNDHWC = ck::tensor_layout::convolution::GNDHWC; + + using GKXC = ck::tensor_layout::convolution::GKXC; + using GKYXC = ck::tensor_layout::convolution::GKYXC; + using GKZYXC = ck::tensor_layout::convolution::GKZYXC; + + using GNWK = ck::tensor_layout::convolution::GNWK; + using GNHWK = ck::tensor_layout::convolution::GNHWK; + using GNDHWK = ck::tensor_layout::convolution::GNDHWK; + + // + using NWGC = ck::tensor_layout::convolution::NWGC; + using NHWGC = ck::tensor_layout::convolution::NHWGC; + using NDHWGC = ck::tensor_layout::convolution::NDHWGC; + + using NWGK = ck::tensor_layout::convolution::NWGK; + using NHWGK = ck::tensor_layout::convolution::NHWGK; + using NDHWGK = ck::tensor_layout::convolution::NDHWGK; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto wei_layout, + auto out_layout, + auto in_type, + auto wei_type, + auto out_type) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + using InDataType = decltype(in_type); + using WeiDataType 
= decltype(wei_type); + using OutDataType = decltype(out_type); + + bool pass = ck::profiler::profile_grouped_conv_fwd_impl( + do_verification, init_method, do_log, time_kernel, params); + + return pass ? 0 : 1; + }; + + // GNHWC_GKYXC_GNHWK + if(num_dim_spatial == 1 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I1, GNWC{}, GKXC{}, GNWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I1, GNWC{}, GKXC{}, GNWK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, INT8{}, INT8{}, INT8{}); + } + } + // NHWGC_GKYXC_NHWGK + else if(num_dim_spatial == 1 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I1, NWGC{}, GKXC{}, NWGK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I1, NWGC{}, GKXC{}, NWGK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I1, NWGC{}, GKXC{}, NWGK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I1, NWGC{}, GKXC{}, NWGK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, INT8{}, INT8{}, INT8{}); + } + } + else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(data_type == ConvDataType::F32_F32_F32) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + return profile(I3, NDHWGC{}, 
GKZYXC{}, NDHWGK{}, F16{}, F16{}, F16{}); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}); + } + else if(data_type == ConvDataType::INT8_INT8_INT8) + { + return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, INT8{}, INT8{}, INT8{}); + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_fwd); diff --git a/profiler/src/profile_grouped_gemm.cpp b/profiler/src/profile_grouped_gemm.cpp new file mode 100644 index 00000000..65e24bd9 --- /dev/null +++ b/profiler/src/profile_grouped_gemm.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_grouped_gemm_impl.hpp" +#include "profiler_operation_registry.hpp" + +enum struct GemmMatrixLayout +{ + MK_KN_MN, // 0 + MK_NK_MN, // 1 + KM_KN_MN, // 2 + KM_NK_MN, // 3 + MK_KN_NM, // 4 + MK_NK_NM, // 5 + KM_KN_NM, // 6 + KM_NK_NM, // 7 +}; + +enum struct GemmDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +std::vector argToIntArray(char* input) +{ + std::vector out; + + std::istringstream in(input); + + std::string item; + + while(std::getline(in, item, ',')) + { + out.push_back(std::stoi(item)); + } + + return out; +} + +#define OP_NAME "grouped_gemm" +#define OP_DESC "Grouped GEMM" + +int profile_grouped_gemm(int argc, char* argv[]) +{ + if(!(argc == 14)) + { + printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); + printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n"); + printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n"); + printf(" 1: A[m, k] * B[n, k] = C[m, n];\n"); + printf(" 2: A[k, m] * B[k, n] = C[m, n];\n"); + printf(" 3: A[k, m] * B[n, k] = C[m, n])\n"); + printf("arg4: verification (0: no; 1: yes)\n"); + printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n"); + printf("arg6: print tensor value (0: no; 1: yes)\n"); + printf("arg7: time kernel (0=n0, 1=yes)\n"); + printf("arg8 to 13: Ms, Ns, Ks, StrideAs, StrideBs, StrideCs (e.g., 256,256 128,128 64,64 " + "64,64 64,64 128,128)\n"); + exit(1); + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + + const auto Ms = argToIntArray(argv[8]); + const auto Ns = argToIntArray(argv[9]); + const auto Ks = argToIntArray(argv[10]); + + const auto StrideAs = argToIntArray(argv[11]); + const auto StrideBs = argToIntArray(argv[12]); + const auto StrideCs = argToIntArray(argv[13]); + + if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN) + { + 
ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs); + } + else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN) + { + ck::profiler::profile_grouped_gemm_impl(do_verification, + init_method, + do_log, + time_kernel, + Ms, + Ns, + Ks, + StrideAs, + StrideBs, + StrideCs); + } + else + { + throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_gemm); diff --git a/profiler/src/profile_groupnorm.cpp b/profiler/src/profile_groupnorm.cpp new file mode 100644 index 00000000..2741f527 --- /dev/null +++ b/profiler/src/profile_groupnorm.cpp @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/data_type_enum.hpp" +#include "profiler/profile_groupnorm_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +struct GroupnormArgParser +{ + std::unordered_map> long_opts = {{"length", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +#define OP_NAME "groupnorm" +#define OP_DESC "Group Normalization" + +void print_help_groupnorm() +{ + std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: fp16; 1: fp32)\n" + << "arg3: verification (0: no; 1: yes)\n" + << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg5: print tensor value (0: no; 1: yes)\n" + << "arg6: time kernel (0=no, 1=yes)\n" + << "--length: tensor extents (e.g, --length 1 16 16 32 40) \n" + << std::endl; +} + +int profile_groupnorm(int argc, char* argv[]) +{ + ck::DataTypeEnum data_type = ck::DataTypeEnum::Half; + bool do_verification = false; + int init_method = 0; + bool do_log = 0; + bool time_kernel = 1; + std::vector length = {64, 16, 16, 32, 40}; + + if(argc != 1 && argc != 13) + { + print_help_groupnorm(); + return 0; + } + + if(argc == 13) + { + data_type = static_cast(std::stoi(argv[2])); + do_verification = std::stoi(argv[3]); + init_method = std::stoi(argv[4]); + do_log = std::stoi(argv[5]); + time_kernel = std::stoi(argv[6]); + + // parse the long options + GroupnormArgParser arg_parser; + arg_parser(argc, argv); + length = arg_parser.long_opts["length"]; + } + + using F16 = ck::half_t; + using F32 = float; + + if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_groupnorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_groupnorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_groupnorm); diff --git a/profiler/src/profile_layernorm.cpp b/profiler/src/profile_layernorm.cpp new file mode 100644 index 00000000..e93fc2db --- /dev/null +++ 
b/profiler/src/profile_layernorm.cpp @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/data_type_enum.hpp" +#include "profiler/profile_layernorm_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +struct LayernormArgParser +{ + std::unordered_map> long_opts = {{"length", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_layernorm() +{ + std::cout << "arg1: data type (0: fp16; 1: fp32)\n" + << "arg2: verification (0: no; 1: yes)\n" + << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg4: print tensor value (0: no; 1: yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" + << "--length: tensor extents (e.g, --length 1024 1024) \n" + << std::endl; +} + +int profile_layernorm(int argc, char* argv[]) +{ + if(argc <= 2) + { + print_help_layernorm(); + return 0; + } + + LayernormArgParser arg_parser; + + // short unnamed options + const ck::DataTypeEnum data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + + // parse the long options + arg_parser(argc, argv); + const std::vector length = arg_parser.long_opts["length"]; + + using F16 = ck::half_t; + using F32 = float; + constexpr int rank = 2; + + if(data_type == ck::DataTypeEnum::Half) + { + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else if(data_type == ck::DataTypeEnum::Float) + { + ck::profiler::profile_layernorm_impl( + do_verification, init_method, do_log, time_kernel, length); + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("layernorm", "Layer Normalization", profile_layernorm); diff --git a/profiler/src/profile_reduce.cpp b/profiler/src/profile_reduce.cpp new file mode 100644 index 00000000..69253718 --- /dev/null +++ b/profiler/src/profile_reduce.cpp @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
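+// Usage sketch (the binary name and the concrete values are assumed examples; the
+// option letters come from the long_options table below):
+//
+//   ckProfiler reduce -D 16,64,32,960 -R 0,1,2 -O 0 --half -v 1 -o 0 2 1
+//
+// getopt_long consumes the named options first; the two trailing positional
+// arguments are init_method and time_kernel, which are read after option parsing.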
+ +#include +#include +#include +#include +#include +#include +#include + +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/utility/host_common_util.hpp" + +#include "profiler/profile_reduce_impl.hpp" +#include "profiler/data_type_enum.hpp" +#include "profiler_operation_registry.hpp" + +using namespace std; + +using ck::ReduceTensorOp; + +static struct option long_options[] = {{"inLengths", required_argument, nullptr, 'D'}, + {"reduceDims", required_argument, nullptr, 'R'}, + {"reduceOp", required_argument, nullptr, 'O'}, + {"compType", required_argument, nullptr, 'C'}, + {"outType", required_argument, nullptr, 'W'}, + {"nanOpt", required_argument, nullptr, 'N'}, + {"indicesOpt", required_argument, nullptr, 'I'}, + {"scales", required_argument, nullptr, 'S'}, + {"half", no_argument, nullptr, '?'}, + {"double", no_argument, nullptr, '?'}, + {"int8", no_argument, nullptr, '?'}, + {"bf16", no_argument, nullptr, '?'}, + {"dumpout", required_argument, nullptr, 'o'}, + {"verify", required_argument, nullptr, 'v'}, + {"help", no_argument, nullptr, '?'}, + {nullptr, 0, nullptr, 0}}; + +static void check_reduce_dims(const int rank, const std::vector& reduceDims) +{ + for(auto dim : reduceDims) + { + if(dim < 0 || dim >= rank) + throw std::runtime_error("Invalid dimension index specified for Reducing"); + }; + + unsigned int flag = 0; + + for(auto dim : reduceDims) + { + if(flag & (0x1 << dim)) + throw std::runtime_error("All toReduce dimensions should be different!"); + flag = flag | (0x1 << dim); + }; +}; + +class ReduceProfilerArgs +{ + private: + int option_index = 0; + + public: + bool use_half = false; + bool use_double = false; + bool use_int8 = false; + bool use_bf16 = false; + + std::vector inLengths; + std::vector outLengths; + std::vector reduceDims; + + std::vector scales; + + ReduceTensorOp reduceOp = ReduceTensorOp::ADD; + ck::DataTypeEnum compTypeId = ck::DataTypeEnum::Float; + ck::DataTypeEnum outTypeId = ck::DataTypeEnum::Float; + + bool compType_assigned = false; + bool outType_assigned = false; + + int nanOpt = 0; + int indicesOpt = 0; + bool do_verification = false; + bool do_dumpout = false; + + int init_method; + bool time_kernel; + + ReduceProfilerArgs() = default; + ~ReduceProfilerArgs() = default; + + void show_usage(const char* cmd) + { + std::cout << "Usage of " << cmd << std::endl; + std::cout << "--inLengths or -D, comma separated list of input tensor dimension lengths" + << std::endl; + std::cout << "--reduceDims or -R, comma separated list of to-reduce dimensions" + << std::endl; + std::cout << "--reduceOp or -O, enum value indicating the reduction operations" + << std::endl; + std::cout << "--compType or -C, enum value indicating the type of accumulated values used " + "during the reduction" + << std::endl; + std::cout << "--outType or -W, optional enum value indicating the type of the reduced " + "output, which could be float when the input data is half" + << std::endl; + std::cout + << "--nanOpt or -N, 1/0 value indicates the selection to use or not use Nan-Propagation" + << std::endl; + std::cout << "--indicesOpt or -I, 1/0 value indicates the selection to use or not use " + "index in reduction" + << std::endl; + std::cout << "--scales or -S, comma separated two float values for alpha and beta" + << std::endl; + std::cout << "--half, use fp16 for the input and output tensor data types" << std::endl; + std::cout << "--double, use fp64 for the input and output tensor data types" << std::endl; + std::cout << "--int8, use int8 for the input and output 
tensor data types" << std::endl; + std::cout << "--bf16, use bfloat16 for the input and output tensor data types" << std::endl; + std::cout << "--verify or -v, 1/0 to indicate whether to verify the reduction result by " + "comparing with the host-based reduction" + << std::endl; + std::cout << "--dumpout or -o, 1/0 to indicate where to save the reduction result to files " + "for further analysis" + << std::endl; + }; + + int processArgs(int argc, char* argv[]) + { + using ck::host_common::getTypeValuesFromString; + + int ch; + + optind++; // to skip the "reduce" module name + + while(1) + { + ch = getopt_long(argc, argv, "D:R:O:C:W:N:I:S:v:o:", long_options, &option_index); + if(ch == -1) + break; + switch(ch) + { + case 'D': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + inLengths = getTypeValuesFromString(optarg); + break; + case 'R': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceDims = getTypeValuesFromString(optarg); + break; + case 'O': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + reduceOp = static_cast(std::atoi(optarg)); + break; + case 'C': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + compTypeId = static_cast(std::atoi(optarg)); + compType_assigned = true; + break; + case 'W': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + outTypeId = static_cast(std::atoi(optarg)); + outType_assigned = true; + break; + case 'N': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + nanOpt = std::atoi(optarg); + break; + case 'I': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + indicesOpt = std::atoi(optarg); + break; + case 'S': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + scales = getTypeValuesFromString(optarg); + + if(scales.size() != 2) + throw std::runtime_error("Invalid option format!"); + break; + case 'v': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_verification = static_cast(std::atoi(optarg)); + break; + case 'o': + if(!optarg) + throw std::runtime_error("Invalid option format!"); + + do_dumpout = static_cast(std::atoi(optarg)); + break; + case '?': + if(std::string(long_options[option_index].name) == "half") + use_half = true; + else if(std::string(long_options[option_index].name) == "double") + use_double = true; + else if(std::string(long_options[option_index].name) == "int8") + use_int8 = true; + else if(std::string(long_options[option_index].name) == "bf16") + use_bf16 = true; + else if(std::string(long_options[option_index].name) == "help") + { + show_usage(argv[0]); + return (-1); + }; + break; + + default: + show_usage(argv[0]); + std::cerr << "Invalid cmd-line options!" 
<< std::endl; + return (-1); + }; + }; + + if(optind + 2 > argc) + throw std::runtime_error("Invalid cmd-line arguments, more argumetns are needed!"); + + init_method = std::atoi(argv[optind++]); + time_kernel = static_cast(std::atoi(argv[optind])); + + if(scales.empty()) + { + scales.push_back(1.0f); + scales.push_back(0.0f); + }; + + if(reduceOp == ReduceTensorOp::MIN || reduceOp == ReduceTensorOp::MAX || + reduceOp == ReduceTensorOp::AMAX) + { + // for indexable operations, no need to assign compType and outType, just let them be + // same as inType + compType_assigned = false; + outType_assigned = false; + }; + + return (0); + }; + +}; // end of class AppArgs + +int profile_reduce(int argc, char* argv[]) +{ + using ck::DataTypeEnum; + using ck::profiler::profile_reduce_impl; + + ReduceProfilerArgs args; + + if(args.processArgs(argc, argv) < 0) + return (-1); + + int rank = args.inLengths.size(); + + check_reduce_dims(rank, args.reduceDims); + + if(args.reduceOp == ReduceTensorOp::MUL || args.reduceOp == ReduceTensorOp::NORM1) + throw std::runtime_error("MUL and NORM1 are not supported by composable kernel!"); + + if(args.use_half) + { + if(!args.compType_assigned) + args.compTypeId = DataTypeEnum::Half; + + if(args.outType_assigned && + (args.outTypeId != DataTypeEnum::Half && args.outTypeId != DataTypeEnum::Float)) + args.outTypeId = DataTypeEnum::Float; + + if(!args.outType_assigned) + args.outTypeId = DataTypeEnum::Half; + + if(args.compTypeId == DataTypeEnum::Half) + { + profile_reduce_impl( + args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else if(args.compTypeId == DataTypeEnum::Float) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else + throw std::runtime_error("Invalid compType assignment!"); + } + else if(args.use_double) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else if(args.use_int8) + { + if(!args.compType_assigned) + args.compTypeId = DataTypeEnum::Int8; + + if(args.outType_assigned && + (args.outTypeId != DataTypeEnum::Int8 && args.outTypeId != DataTypeEnum::Int32)) + args.outTypeId = DataTypeEnum::Int32; + + if(!args.outType_assigned) + args.outTypeId = DataTypeEnum::Int8; + + if(args.compTypeId == DataTypeEnum::Int8) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else if(args.compTypeId == DataTypeEnum::Int32) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else + throw std::runtime_error("Invalid compType assignment!"); + } + else if(args.use_bf16) + { + if(args.outType_assigned && + (args.outTypeId != DataTypeEnum::BFloat16 
&& args.outTypeId != DataTypeEnum::Float)) + args.outTypeId = DataTypeEnum::Float; + + if(!args.outType_assigned) + args.outTypeId = DataTypeEnum::BFloat16; + + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else + { + if(args.compTypeId == DataTypeEnum::Float) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else if(args.compTypeId == DataTypeEnum::Double) + { + profile_reduce_impl(args.do_verification, + args.init_method, + args.do_dumpout, + args.time_kernel, + args.inLengths, + args.reduceDims, + args.reduceOp, + static_cast(args.nanOpt), + static_cast(args.indicesOpt), + args.scales[0], + args.scales[1]); + } + else + throw std::runtime_error("Invalid compType assignment!"); + }; + + return (0); +}; + +REGISTER_PROFILER_OPERATION("reduce", "Reduce", profile_reduce); diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp new file mode 100644 index 00000000..30f627dd --- /dev/null +++ b/profiler/src/profile_softmax.cpp @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/profile_softmax_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; +using ck::profiler::SoftmaxDataType; + +struct ArgParser +{ + std::unordered_map> long_opts = { + {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help() +{ + std::cout << "arg1: tensor operation (softmax)\n" + << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" + << "arg3: verification (0: no; 1: yes)\n" + << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg5: print tensor value (0: no; 1: yes)\n" + << "arg6: time kernel (0=n0, 1=yes)\n" + << "--length: tensor extents (e.g, --length 8 4 256) \n" + << "--stride: tensor strides (e.g, --stride 1024 256 1)\n" + << "--reduce: to-reduce dimensions (e.g, --reduce 2)\n" + << "--alpha: alpha scaling value\n" + << "--beta: beta scaling value\n" + << std::endl; +} + +int profile_softmax(int argc, char* argv[]) +{ + if(argc <= 2) + { + print_help(); + return 0; + } + + ArgParser arg_parser; + + // short unnamed options + const SoftmaxDataType data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); + + // parse the long options + arg_parser(argc, argv); + const std::vector length = arg_parser.long_opts["length"]; + const std::vector stride = 
arg_parser.long_opts["stride"]; + const std::vector reduce = arg_parser.long_opts["reduce"]; + const index_t alpha = + arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0]; + const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0]; + + // Rank 3 + if(length.size() == 3) + { + if(data_type == SoftmaxDataType::F16_F16) + { + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta)); + } + else if(data_type == SoftmaxDataType::F32_F32) + { + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta)); + } + else + { + throw std::runtime_error("not implemented yet"); + } + } + // Rank 4 + else if(length.size() == 4) + { + if(data_type == SoftmaxDataType::F16_F16) + { + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta)); + } + else if(data_type == SoftmaxDataType::F32_F32) + { + ck::profiler::profile_softmax_impl(do_verification, + init_method, + do_log, + time_kernel, + length, + stride, + reduce, + float(alpha), + float(beta)); + } + else + { + throw std::runtime_error("not implemented yet"); + } + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +// hijack main() for quick debugging +// int main(int argc, char* argv[]) +// { +// profile_normalization(argc, argv); +// return 0; +// } + +REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax); diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp new file mode 100644 index 00000000..080117e3 --- /dev/null +++ b/profiler/src/profiler.cpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "profiler_operation_registry.hpp" + +static void print_helper_message() +{ + std::cout << "arg1: tensor operation " << ProfilerOperationRegistry::GetInstance() << std::endl; +} + +int main(int argc, char* argv[]) +{ + if(argc == 1) + { + print_helper_message(); + } + else if(const auto operation = ProfilerOperationRegistry::GetInstance().Get(argv[1]); + operation.has_value()) + { + return (*operation)(argc, argv); + } + else + { + std::cerr << "cannot find operation: " << argv[1] << std::endl; + return EXIT_FAILURE; + } +} diff --git a/profiler/src/profiler_operation_registry.hpp b/profiler/src/profiler_operation_registry.hpp new file mode 100644 index 00000000..91ff2912 --- /dev/null +++ b/profiler/src/profiler_operation_registry.hpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include +#include + +class ProfilerOperationRegistry final +{ + ProfilerOperationRegistry() = default; + ~ProfilerOperationRegistry() = default; + + public: + using Operation = std::function; + + private: + struct Entry final + { + explicit Entry(std::string_view description, Operation operation) noexcept + : description_(description), operation_(std::move(operation)) + { + } + + std::string_view description_; + Operation operation_; + }; + + std::map entries_; + + friend std::ostream& operator<<(std::ostream& stream, const ProfilerOperationRegistry& registry) + { + stream << "{\n"; + for(auto& [name, entry] : registry.entries_) + { + stream << "\t" << name << ": " << entry.description_ << "\n"; + } + stream << "}"; + + return stream; + } + + public: + static ProfilerOperationRegistry& GetInstance() + { + static ProfilerOperationRegistry registry; + return registry; + } + + std::optional Get(std::string_view name) const + { + const auto found = entries_.find(name); + if(found == end(entries_)) + { + return std::nullopt; + } + + return (found->second).operation_; + } + + bool Add(std::string_view name, std::string_view description, Operation operation) + { + return entries_ + .emplace(std::piecewise_construct, + std::forward_as_tuple(name), + std::forward_as_tuple(description, std::move(operation))) + .second; + } +}; + +#define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y) +#define PP_CONCAT_IMPL(x, y) x##y + +#define REGISTER_PROFILER_OPERATION(name, description, operation) \ + static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \ + ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation) diff --git a/rbuild.ini b/rbuild.ini new file mode 100644 index 00000000..3649cedf --- /dev/null +++ b/rbuild.ini @@ -0,0 +1,8 @@ +[develop] +cxx = ${rocm_path}/bin/hipcc +cc = ${rocm_path}/llvm/bin/clang +ignore = pcre +deps = + -f dev-requirements.txt +define = + BUILD_DEV=On diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ + diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh new file mode 100644 index 00000000..f9d11fcd --- /dev/null +++ b/script/clang-format-overwrite.sh @@ -0,0 +1,2 @@ +#find . 
-name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' +git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|hpp|inc")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-10 -i -style=file {}' diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh new file mode 100644 index 00000000..2e605ce8 --- /dev/null +++ b/script/cmake-ck-dev.sh @@ -0,0 +1,19 @@ +#!/bin/bash +rm -f CMakeCache.txt +rm -f *.cmake +rm -rf CMakeFiles + +MY_PROJECT_SOURCE=$1 + +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3 -ftemplate-backtrace-limit=0 -gline-tables-only -save-temps=$PWD" \ +-D CMAKE_BUILD_TYPE=Release \ +-D BUILD_DEV=ON \ +-D GPU_TARGETS="gfx908;gfx90a" \ +-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +-D USE_BITINT_EXTENSION_INT4=OFF \ +${MY_PROJECT_SOURCE} + +#-D AMDGPU_TARGETS=gfx90a;gfx908 diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh new file mode 100644 index 00000000..268b1ebf --- /dev/null +++ b/script/cmake-ck-release.sh @@ -0,0 +1,19 @@ +#!/bin/bash +rm -f CMakeCache.txt +rm -f *.cmake +rm -rf CMakeFiles + +MY_PROJECT_SOURCE=$1 + +cmake \ +-D CMAKE_PREFIX_PATH=/opt/rocm \ +-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ +-D CMAKE_CXX_FLAGS="-O3" \ +-D CMAKE_BUILD_TYPE=Release \ +-D BUILD_DEV=OFF \ +-D GPU_TARGETS="gfx908;gfx90a" \ +-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \ +-D USE_BITINT_EXTENSION_INT4=OFF \ +${MY_PROJECT_SOURCE} + +#-D AMDGPU_TARGETS=gfx90a;gfx908 diff --git a/script/count_vgpr.sh b/script/count_vgpr.sh new file mode 100644 index 00000000..07debc53 --- /dev/null +++ b/script/count_vgpr.sh @@ -0,0 +1,20 @@ +#!/bin/bash +FILE=$1 + +for num in {0..255} +do + base_pattern="(\[?${num}\b|\[\d*:${num}\])" + spattern="s${base_pattern}" + vpattern="v${base_pattern}" + apattern="a${base_pattern}" + scount=$(grep -P $spattern $FILE | wc -l) + vcount=$(grep -P $vpattern $FILE | wc -l) + acount=$(grep -P $apattern $FILE | wc -l) + bash -c "echo -n v${num} $vcount && \ + echo -n , s${num} $scount && \ + echo -n , a${num} $acount" + if [[ $scount -ne 0 || $vcount -ne 0 || $acount -ne 0 ]]; then + echo -n " *" + fi + echo "" +done diff --git a/script/hipclang_opt.sh b/script/hipclang_opt.sh new file mode 100644 index 00000000..c51bd51d --- /dev/null +++ b/script/hipclang_opt.sh @@ -0,0 +1,25 @@ +rm *.ll *.s + +BC_FILE=$1 + +/opt/rocm/llvm/bin/llvm-dis $BC_FILE -o original.ll +/opt/rocm/llvm/bin/opt -S -inline -inline-threshold=104857 original.ll > inline.ll +/opt/rocm/llvm/bin/opt -S -sroa inline.ll > sroa.ll +/opt/rocm/llvm/bin/opt -S -O3 sroa.ll > o3.ll + +/opt/rocm/llvm/bin/llc -mcpu=gfx906 original.ll +/opt/rocm/llvm/bin/llc -mcpu=gfx906 inline.ll +/opt/rocm/llvm/bin/llc -mcpu=gfx906 sroa.ll +/opt/rocm/llvm/bin/llc -mcpu=gfx906 o3.ll + +#/opt/rocm/llvm/bin/opt -S -O3 -sroa inline.ll > o3.ll +#/opt/rocm/llvm/bin/opt -S -O3 -sroa o3.ll > o3_2.ll +#/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_2.ll > o3_3.ll +#/opt/rocm/llvm/bin/opt -S -O3 -sroa o3_3.ll > o3_4.ll + +#/opt/rocm/llvm/bin/llc -mcpu=gfx908 opt.ll +#/opt/rocm/llvm/bin/llc -mcpu=gfx908 inline.ll +#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3.ll +#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_2.ll +#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_3.ll +#/opt/rocm/llvm/bin/llc -mcpu=gfx908 o3_4.ll diff --git a/script/parse_perf_data.py 
b/script/parse_perf_data.py new file mode 100644 index 00000000..4cb13e62 --- /dev/null +++ b/script/parse_perf_data.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +import os, io, argparse, datetime, re +import numpy as np +import sqlalchemy +from sqlalchemy.types import NVARCHAR, Float, Integer +import pymysql +import pandas as pd +from sshtunnel import SSHTunnelForwarder + +def print_to_string(*args, **kwargs): + output = io.StringIO() + print(*args, file=output, **kwargs) + contents = output.getvalue() + output.close() + return contents + +def parse_args(): + parser = argparse.ArgumentParser(description='Parse results from tf benchmark runs') + parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files') + args = parser.parse_args() + files = [] + if os.path.isdir(args.filename): + all_files = os.listdir(args.filename) + for name in all_files: + if not 'log' in name: + continue + files.append(os.path.join(args.filename, name)) + else: + files = [args.filename] + args.files = files + return args + +def main(): + args = parse_args() + tests = [] + kernels=[] + tflops=[] + dtype=[] + alayout=[] + blayout=[] + M=[] + N=[] + K=[] + StrideA=[] + StrideB=[] + StrideC=[] + #parse results, get the Tflops value for "Best Perf" kernels + + glue="" + for filename in args.files: + for line in open(filename): + if 'Branch name' in line: + lst=line.split() + branch_name=lst[2] + if 'On branch' in line: + lst=line.split() + branch_name=lst[2] + if 'Node name' in line: + lst=line.split() + node_id=lst[2] + if 'GPU_arch' in line: + lst=line.split() + gpu_arch=lst[2] + if 'HIP version' in line: + lst=line.split() + hip_vers=lst[2] + if 'Compute Unit' in line: + lst=line.split() + compute_units=lst[2] + if 'InstalledDir' in line: + lst=line.split() + rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')] + print("Branch name:",branch_name) + print("Node name:",node_id) + print("GPU_arch:",gpu_arch) + print("Compute units:",compute_units) + print("ROCM_version:",rocm_vers) + print("HIP_version:",hip_vers) + + + #parse gemm performance tests: + if 'gemm' in filename: + for filename in args.files: + for line in open(filename): + if 'Best Perf' in line: + lst=line.split() + if len(lst)>=37: #the line is complete + tests.append(glue.join(lst[5:30])) + kernels.append(glue.join(lst[37:])) + tflops.append(lst[33]) + dtype.append(lst[5]) + alayout.append(lst[8]) + blayout.append(lst[11]) + M.append(lst[14]) + N.append(lst[17]) + K.append(lst[20]) + StrideA.append(lst[23]) + StrideB.append(lst[26]) + StrideC.append(lst[29]) + elif len(lst)<37 and len(lst)>=33: #the tflops are available + tests.append(glue.join(lst[5:30])) + kernels.append("N/A") + tflops.append(lst[33]) + dtype.append(lst[5]) + alayout.append(lst[8]) + blayout.append(lst[11]) + M.append(lst[14]) + N.append(lst[17]) + K.append(lst[20]) + StrideA.append(lst[23]) + StrideB.append(lst[26]) + StrideC.append(lst[29]) + print("warning: incomplete line:",lst) + elif len(lst)<33: #even the tflops are not available + print("Error in ckProfiler output!") + print("warning: incomplete line=",lst) + #sort results + #sorted_tests = sorted(tests) + #print("sorted tests:",sorted_tests) + sorted_tflops = [x for _,x in sorted(zip(tests,tflops))] + #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] + test_list=list(range(1,len(tests)+1)) + + #parse resnet50 performance tests: + if 'resnet50' in filename: + for filename in args.files: + for line in open(filename): + if 'Best Perf' in line: 
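+            # For resnet50 logs only the throughput is collected: the TFLOPS value is
+            # taken as the 5th whitespace-separated token (lst[4]) of each "Best Perf" line.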
+ lst=line.split() + tflops.append(lst[4]) + + print("Number of tests:",len(tflops)) + sql_hostname = '127.0.0.1' + sql_username = os.environ["dbuser"] + sql_password = os.environ["dbpassword"] + sql_main_database = 'miopen_perf' + sql_port = 3306 + ssh_host = os.environ["dbsship"] + ssh_user = os.environ["dbsshuser"] + ssh_port = int(os.environ["dbsshport"]) + ssh_pass = os.environ["dbsshpassword"] + + with SSHTunnelForwarder( + (ssh_host, ssh_port), + ssh_username=ssh_user, + ssh_password=ssh_pass, + remote_bind_address=(sql_hostname, sql_port)) as tunnel: + + sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'. + format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) + conn = sqlEngine.connect() + + #save gemm performance tests: + if 'gemm' in filename: + + #write the ck_gemm_test_params table + #only needed once the test set changes + ''' + sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))] + sorted_alayout = [x for _,x in sorted(zip(tests,alayout))] + sorted_blayout = [x for _,x in sorted(zip(tests,blayout))] + sorted_M = [x for _,x in sorted(zip(tests,M))] + sorted_N = [x for _,x in sorted(zip(tests,N))] + sorted_K = [x for _,x in sorted(zip(tests,K))] + sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))] + sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))] + sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))] + ck_gemm_params=[test_list,sorted_dtypes,sorted_alayout,sorted_blayout, + sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB, + sorted_StrideC] + df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type', + 'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC']) + print(df) + + dtypes = { + 'Test_number': Integer(), + 'Data_type': NVARCHAR(length=5), + 'Alayout': NVARCHAR(length=12), + 'Blayout': NVARCHAR(length=12), + 'M': Integer(), + 'N': Integer(), + 'K': Integer(), + 'StrideA': Integer(), + 'StrideB': Integer(), + 'StrideC': Integer() + } + df.to_sql("ck_gemm_test_params",conn,if_exists='replace',index=False, dtype=dtypes) + ''' + + #read baseline results for the latest develop branch + query = '''SELECT * from ck_gemm_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_gemm_tflops where Branch_ID='develop' );''' + tflops_base = pd.read_sql_query(query, conn) + + #write new results to the db + testlist=[] + for i in range(1,len(tests)+1): + testlist.append("Test%i"%i) + ck_gemm_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())] + flops=pd.DataFrame(data=[ck_gemm_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime']) + df_add=pd.DataFrame(data=[sorted_tflops],columns=testlist) + flops=pd.concat([flops,df_add],axis=1) + print("new tflops for gemm tests:",flops) + flops.to_sql("ck_gemm_tflops",conn,if_exists='append',index=False) + + #save resnet50 performance tests: + if 'resnet50' in filename: + #read baseline results for the latest develop branch + query = '''SELECT * from ck_resnet50_N256_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N256_tflops where Branch_ID='develop' );''' + tflops_base_N256 = pd.read_sql_query(query, conn) + query = '''SELECT * from ck_resnet50_N4_tflops WHERE Datetime = (SELECT MAX(Datetime) FROM ck_resnet50_N4_tflops where Branch_ID='develop' );''' + tflops_base_N4 = pd.read_sql_query(query, conn) + + #write new results to the db + testlist=[] + for i in range(1,50): + 
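+            # 49 resnet50 layers are tracked (Layer1..Layer49); the log is expected to hold
+            # two sweeps, so tflops[0:49] feeds the N=256 table and tflops[49:98] the N=4 table below.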
testlist.append("Layer%i"%i) + ck_resnet_tflops=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(datetime.datetime.now())] + flops0=pd.DataFrame(data=[ck_resnet_tflops],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Datetime']) + df_add=pd.DataFrame(data=[tflops[0:49]],columns=testlist) + flops=pd.concat([flops0,df_add],axis=1) + print("new tflops for N=256 resnet50 test:",flops) + flops.to_sql("ck_resnet50_N256_tflops",conn,if_exists='append',index=False) + df_add=pd.DataFrame(data=[tflops[49:98]],columns=testlist) + flops=pd.concat([flops0,df_add],axis=1) + print("new tflops for N=4 resnet50 test:",flops) + flops.to_sql("ck_resnet50_N4_tflops",conn,if_exists='append',index=False) + + conn.close() + + #compare the results to the baseline if baseline exists + regression=0 + if 'gemm' in filename: + if not tflops_base.empty: + base=tflops_base[testlist].to_numpy(dtype='float') + base_list=base[0] + ave_perf=0 + for i in range(len(base_list)): + # success criterion: + if base_list[i]>1.01*float(sorted_tflops[i]): + print("test # ",i,"shows regression by {:.3f}%".format( + (float(sorted_tflops[i])-base_list[i])/base_list[i]*100)) + regression=1 + ave_perf=ave_perf+float(sorted_tflops[i])/base_list[i] + if regression==0: + print("no regressions found") + ave_perf=ave_perf/len(base_list) + print("average performance relative to baseline:",ave_perf) + else: + print("could not find a baseline") + if 'resnet50' in filename: + if not tflops_base_N256.empty: + base=tflops_base_N256[testlist].to_numpy(dtype='float') + base_list=base[0] + ave_perf=0 + for i in range(len(base_list)): + # success criterion: + if base_list[i]>1.01*float(tflops[i]): + print("layer # ",i,"shows regression by {:.3f}%".format( + (float(tflops[i])-base_list[i])/base_list[i]*100)) + regression=1 + ave_perf=ave_perf+float(tflops[i])/base_list[i] + if regression==0: + print("no regressions found") + ave_perf=ave_perf/len(base_list) + print("average performance relative to baseline:",ave_perf) + else: + print("could not find a baseline for N=256") + if not tflops_base_N4.empty: + base=tflops_base_N4[testlist].to_numpy(dtype='float') + base_list=base[0] + ave_perf=0 + for i in range(len(base_list)): + # success criterion: + if base_list[i]>1.01*float(tflops[i+49]): + print("layer # ",i,"shows regression by {:.3f}%".format( + (float(tflops[i+49])-base_list[i])/base_list[i]*100)) + regression=1 + ave_perf=ave_perf+float(tflops[i+49])/base_list[i] + if regression==0: + print("no regressions found") + ave_perf=ave_perf/len(base_list) + print("average performance relative to baseline:",ave_perf) + else: + print("could not find a baseline for N=4") + + #return 0 if performance criteria met, otherwise return 1 + return regression + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/script/process_perf_data.py b/script/process_perf_data.py new file mode 100644 index 00000000..638e4ef5 --- /dev/null +++ b/script/process_perf_data.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +import os, io, argparse, datetime +#import numpy as np +import sqlalchemy +from sqlalchemy.types import NVARCHAR, Float, Integer +import pymysql +import pandas as pd +from sshtunnel import SSHTunnelForwarder + +def print_to_string(*args, **kwargs): + output = io.StringIO() + print(*args, file=output, **kwargs) + contents = output.getvalue() + output.close() + return contents + +def parse_args(): + parser = argparse.ArgumentParser(description='Parse results 
from tf benchmark runs') + parser.add_argument('filename', type=str, help='Log file to prase or directory containing log files') + args = parser.parse_args() + files = [] + if os.path.isdir(args.filename): + all_files = os.listdir(args.filename) + for name in all_files: + if not 'log' in name: + continue + files.append(os.path.join(args.filename, name)) + else: + files = [args.filename] + args.files = files + return args + +def get_log_params(logfile): + print("logfile=",logfile) + branch_name=' ' + node_id=' ' + gpu_arch=' ' + hip_vers=' ' + compute_units=0 + environment=' ' + rocm_vers=' ' + for line in open(logfile): + if 'Branch name' in line: + lst=line.split() + branch_name=lst[2] + if 'On branch' in line: + lst=line.split() + branch_name=lst[2] + if 'Node name' in line: + lst=line.split() + node_id=lst[2] + if 'GPU_arch' in line: + lst=line.split() + gpu_arch=lst[2] + if 'HIP version' in line: + lst=line.split() + hip_vers=lst[2] + if 'Compute Unit' in line: + lst=line.split() + compute_units=lst[2] + if 'Environment type' in line: + lst=line.split() + environment=lst[2] + if 'InstalledDir' in line: + lst=line.split() + rocm_vers=lst[1][lst[1].find('/opt/rocm-')+len('/opt/rocm-'):lst[1].rfind('/llvm/bin')] + return branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment + +def parse_logfile(logfile): + glue='' + res=[] + tests=[] + kernels=[] + tflops=[] + dtype=[] + alayout=[] + blayout=[] + M=[] + N=[] + K=[] + StrideA=[] + StrideB=[] + StrideC=[] + if 'perf_gemm.log' in logfile: + for line in open(logfile): + if 'Best Perf' in line: + lst=line.split() + if len(lst)>=37: #the line is complete + tests.append(glue.join(lst[5:30])) + kernels.append(glue.join(lst[37:])) + tflops.append(lst[33]) + dtype.append(lst[5]) + alayout.append(lst[8]) + blayout.append(lst[11]) + M.append(lst[14]) + N.append(lst[17]) + K.append(lst[20]) + StrideA.append(lst[23]) + StrideB.append(lst[26]) + StrideC.append(lst[29]) + elif len(lst)<37 and len(lst)>=33: #the tflops are available + tests.append(glue.join(lst[5:30])) + kernels.append("N/A") + tflops.append(lst[33]) + dtype.append(lst[5]) + alayout.append(lst[8]) + blayout.append(lst[11]) + M.append(lst[14]) + N.append(lst[17]) + K.append(lst[20]) + StrideA.append(lst[23]) + StrideB.append(lst[26]) + StrideC.append(lst[29]) + print("warning: incomplete line:",lst) + elif len(lst)<33: #even the tflops are not available + print("Error in ckProfiler output!") + print("warning: incomplete line=",lst) + #sort results + #sorted_tests = sorted(tests) + res = [x for _,x in sorted(zip(tests,tflops))] + #sorted_kernels = [x for _,x in sorted(zip(tests,kernels))] + test_list=list(range(1,len(tests)+1)) + #parse conv_fwd and conv_bwd performance tests: + elif 'conv_fwd' in logfile or 'conv_bwd_data' in logfile: + for line in open(logfile): + if 'tflops:' in line: + lst=line.split() + res.append(lst[1]) + #parse all other performance tests: + elif 'resnet50' in logfile or 'batched_gemm' in logfile or 'grouped_gemm' in logfile or 'gemm_bilinear' in logfile or 'reduction' in logfile: + for line in open(logfile): + if 'Best Perf' in line: + lst=line.split() + res.append(lst[4]) + elif 'onnx_gemm' in logfile or 'splitK_gemm' in logfile: + for line in open(logfile): + if 'Best Perf' in line: + lst=line.split() + res.append(lst[33]) + return res + + +def get_baseline(table, connection): + query = '''SELECT * from '''+table+''' WHERE Datetime = (SELECT MAX(Datetime) FROM '''+table+''' where Branch_ID='develop' );''' + return 
pd.read_sql_query(query, connection) + +def store_new_test_result(table_name, test_results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, connection): + params=[str(branch_name),str(node_id),str(gpu_arch),compute_units,str(rocm_vers),str(hip_vers),str(environment),str(datetime.datetime.now())] + df=pd.DataFrame(data=[params],columns=['Branch_ID','Node_ID','GPU_arch','Compute Units','ROCM_version','HIP_version','Environment','Datetime']) + df_add=pd.DataFrame(data=[test_results],columns=testlist) + df=pd.concat([df,df_add],axis=1) + #print("new test results dataframe:",df) + df.to_sql(table_name,connection,if_exists='append',index=False) + return 0 + +def compare_test_to_baseline(baseline,test,testlist): + regression=0 + if not baseline.empty: + base=baseline[testlist].to_numpy(dtype='float') + base_list=base[0] + ave_perf=0 + for i in range(len(base_list)): + # success criterion: + if base_list[i]>1.01*float(test[i]): + print("test # ",i,"shows regression by {:.3f}%".format( + (float(test[i])-base_list[i])/base_list[i]*100)) + regression=1 + if base_list[i]>0: ave_perf=ave_perf+float(test[i])/base_list[i] + if regression==0: + print("no regressions found") + ave_perf=ave_perf/len(base_list) + print("average performance relative to baseline:",ave_perf) + else: + print("could not find a baseline") + return regression + +''' +def post_test_params(tlist,connection): + sorted_dtypes = [x for _,x in sorted(zip(tests,dtype))] + sorted_alayout = [x for _,x in sorted(zip(tests,alayout))] + sorted_blayout = [x for _,x in sorted(zip(tests,blayout))] + sorted_M = [x for _,x in sorted(zip(tests,M))] + sorted_N = [x for _,x in sorted(zip(tests,N))] + sorted_K = [x for _,x in sorted(zip(tests,K))] + sorted_StrideA = [x for _,x in sorted(zip(tests,StrideA))] + sorted_StrideB = [x for _,x in sorted(zip(tests,StrideB))] + sorted_StrideC = [x for _,x in sorted(zip(tests,StrideC))] + ck_gemm_params=[tlist,sorted_dtypes,sorted_alayout,sorted_blayout, + sorted_M,sorted_N,sorted_K,sorted_StrideA,sorted_StrideB, + sorted_StrideC] + df=pd.DataFrame(np.transpose(ck_gemm_params),columns=['Test_number','Data_type', + 'Alayout','BLayout','M','N','K', 'StrideA','StrideB','StrideC']) + print(df) + + dtypes = { + 'Test_number': Integer(), + 'Data_type': NVARCHAR(length=5), + 'Alayout': NVARCHAR(length=12), + 'Blayout': NVARCHAR(length=12), + 'M': Integer(), + 'N': Integer(), + 'K': Integer(), + 'StrideA': Integer(), + 'StrideB': Integer(), + 'StrideC': Integer() + } + df.to_sql("ck_gemm_test_params",connection,if_exists='replace',index=False, dtype=dtypes) +''' + +def main(): + args = parse_args() + results=[] + tflops_base=[] + testlist=[] + #parse the test parameters from the logfile + for filename in args.files: + branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment = get_log_params(filename) + + print("Branch name:",branch_name) + print("Node name:",node_id) + print("GPU_arch:",gpu_arch) + print("Compute units:",compute_units) + print("ROCM_version:",rocm_vers) + print("HIP_version:",hip_vers) + print("Environment:",environment) + #parse results, get the Tflops value for "Best Perf" kernels + results=parse_logfile(filename) + + print("Number of tests:",len(results)) + sql_hostname = '127.0.0.1' + sql_username = os.environ["dbuser"] + sql_password = os.environ["dbpassword"] + sql_main_database = 'miopen_perf' + sql_port = 3306 + ssh_host = os.environ["dbsship"] + ssh_user = os.environ["dbsshuser"] + ssh_port = int(os.environ["dbsshport"]) + 
ssh_pass = os.environ["dbsshpassword"] + + with SSHTunnelForwarder( + (ssh_host, ssh_port), + ssh_username=ssh_user, + ssh_password=ssh_pass, + remote_bind_address=(sql_hostname, sql_port)) as tunnel: + + sqlEngine = sqlalchemy.create_engine('mysql+pymysql://{0}:{1}@{2}:{3}/{4}'. + format(sql_username, sql_password, sql_hostname, tunnel.local_bind_port, sql_main_database)) + conn = sqlEngine.connect() + + #save gemm performance tests: + if 'perf_gemm.log' in filename: + #write the ck_gemm_test_params table only needed once the test set changes + #post_test_params(test_list,conn) + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_gemm_tflops" + if 'batched_gemm' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_batched_gemm_tflops" + if 'grouped_gemm' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_grouped_gemm_tflops" + if 'conv_fwd' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_conv_fwd_tflops" + if 'conv_bwd_data' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_conv_bwd_data_tflops" + if 'gemm_bilinear' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_gemm_bilinear_tflops" + if 'reduction' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_reduction_GBps" + if 'resnet50_N4' in filename: + for i in range(1,50): + testlist.append("Layer%i"%i) + table_name="ck_resnet50_N4_tflops" + if 'resnet50_N256' in filename: + for i in range(1,50): + testlist.append("Layer%i"%i) + table_name="ck_resnet50_N256_tflops" + if 'onnx_gemm' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_onnx_gemm_tflops" + if 'splitK_gemm' in filename: + for i in range(1,len(results)+1): + testlist.append("Test%i"%i) + table_name="ck_splitK_gemm_tflops" + + tflops_base = get_baseline(table_name,conn) + store_new_test_result(table_name, results, testlist, branch_name, node_id, gpu_arch, compute_units, rocm_vers, hip_vers, environment, conn) + conn.close() + + #compare the results to the baseline if baseline exists + regression=0 + regression=compare_test_to_baseline(tflops_base,results,testlist) + return regression + +if __name__ == '__main__': + main() diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh new file mode 100644 index 00000000..15fc5cb1 --- /dev/null +++ b/script/process_perf_data.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# +# in order to run this script you'd need the following python packages: + +#pip3 install --upgrade pip +#pip3 install sqlalchemy pymysql pandas sshtunnel + +# you would also need to set up some environment variables in order to +# post your new test results to the database and compare them to the baseline +# please contact Illia.Silin@amd.com for more details + +#process results +python3 process_perf_data.py perf_gemm.log +python3 process_perf_data.py perf_resnet50_N256.log +python3 process_perf_data.py perf_resnet50_N4.log diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh new file mode 100644 index 00000000..abf1e623 --- /dev/null +++ b/script/process_qa_data.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# +# in order to run this script you'd need the following python packages: + +#pip3 install --upgrade pip +#pip3 install sqlalchemy pymysql pandas sshtunnel + +# you would also need to set up some environment variables 
in order to +# post your new test results to the database and compare them to the baseline +# please contact Illia.Silin@amd.com for more details + +#process results +python3 process_perf_data.py perf_gemm.log +python3 process_perf_data.py perf_resnet50_N256.log +python3 process_perf_data.py perf_resnet50_N4.log +python3 process_perf_data.py perf_batched_gemm.log +python3 process_perf_data.py perf_grouped_gemm.log +python3 process_perf_data.py perf_conv_fwd.log +python3 process_perf_data.py perf_conv_bwd_data.log +python3 process_perf_data.py perf_gemm_bilinear.log +python3 process_perf_data.py perf_reduction.log +python3 process_perf_data.py perf_splitK_gemm.log +python3 process_perf_data.py perf_onnx_gemm.log diff --git a/script/profile_batched_gemm.sh b/script/profile_batched_gemm.sh new file mode 100644 index 00000000..d19ddd0c --- /dev/null +++ b/script/profile_batched_gemm.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 -1 -1 -1 2 + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1024 1024 1024 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2048 2048 2048 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4096 4096 4096 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8192 8192 8192 -1 -1 -1 2 + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 -1 -1 -1 2 + + ####### op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC BatchStrideA BatchStrideB BatchStrideC BatchCount + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 -1 -1 -1 8 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 -1 -1 -1 4 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 -1 -1 -1 2 diff --git a/script/profile_conv_bwd_data.sh b/script/profile_conv_bwd_data.sh new file mode 100644 index 00000000..a1d2f450 --- /dev/null +++ b/script/profile_conv_bwd_data.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" + 
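+# positional arguments forwarded to ckProfiler: $1=op, $2=datatype, $3=layout,
+# $4=verify, $5=init, $6=log, $7=time; $8 sets the batch size N for the shapes below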
+OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + + N=$8 + +# Resnet50 +######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 diff --git a/script/profile_conv_fwd.sh b/script/profile_conv_fwd.sh new file mode 100644 index 00000000..a1d2f450 --- /dev/null +++ b/script/profile_conv_fwd.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" + +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + + N=$8 + +# Resnet50 +######## op datatype layout verify init log time conv_dim G__ N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 + 
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2 1 $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 diff --git a/script/profile_gemm.sh b/script/profile_gemm.sh new file mode 100644 index 00000000..b88159e7 --- /dev/null +++ b/script/profile_gemm.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +echo $DRIVER +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + + +# 120 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 2048 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 1024 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 + +# 104 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 832 1024 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 832 2048 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1664 1024 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1664 2048 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3328 4096 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 6656 8192 8192 -1 -1 -1 + +# 110 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 1408 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 2816 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2560 1408 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2560 2816 2048 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 5120 5632 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7040 8192 8192 -1 -1 -1 + +# testing different strides +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1024 1024 1024 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2048 2048 2048 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4096 4096 4096 + 
$DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8192 8192 8192 + + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 + + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 diff --git a/script/profile_gemm_bilinear.sh b/script/profile_gemm_bilinear.sh new file mode 100644 index 00000000..e6edefae --- /dev/null +++ b/script/profile_gemm_bilinear.sh @@ -0,0 +1,41 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 -1 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 -1 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 -1 -1 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1920 2048 2048 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 3840 4096 4096 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 7680 8192 8192 -1 -1 0 -1 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1000 1000 1000 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2000 2000 2000 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4000 4000 4000 -1 -1 0 -1 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8000 8000 8000 -1 -1 0 -1 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 1056 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 2080 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4128 4128 4128 4128 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8224 8224 8224 8224 1 1 + +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideD StrideE Alpha Beta + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 1088 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 2112 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 4096 4096 4096 4160 4160 4160 4160 1 1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 8192 8192 8192 8256 8256 8256 8256 1 1 \ No newline at end of file diff --git 
a/script/profile_grouped_gemm.sh b/script/profile_grouped_gemm.sh new file mode 100644 index 00000000..8adb7c81 --- /dev/null +++ b/script/profile_grouped_gemm.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 + +######## op datatype layout verify init log time Ms______________ Ns______________ Ks_____________ StrideAs___________ StrideBs__________ StrideCs___________ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 256,512,1024,768 128,256,384,1024 128,192,256,512 1024,1025,1044,1026 1024,1024,1024,1024 1025,1024,1028,1024 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 512,768,2048,128 128,256,384,1024 128,192,256,512 1024,1025,2053,1026 1024,1024,1024,1024 1025,1024,2054,1024 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 256,512,1024,768 512,256,768,1024 128,192,256,512 1024,1045,1034,1026 1024,1024,1024,1024 1025,1063,1028,1024 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 512,768,4096,768 128,768,512,2048 128,192,256,512 1024,1027,4096,2050 1024,1024,1024,2048 1025,1024,4099,2049 diff --git a/script/profile_onnx_gemm.sh b/script/profile_onnx_gemm.sh new file mode 100644 index 00000000..c2721e7f --- /dev/null +++ b/script/profile_onnx_gemm.sh @@ -0,0 +1,31 @@ +#!/bin/bash +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +echo $DRIVER +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 +# GEMM kernel benchmarks used by ONNX +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 768 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 768 2304 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 768 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 3072 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 1024 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 1024 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 1024 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 384 4096 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 768 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 768 2304 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 768 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 3072 768 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 1024 1024 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 1024 3072 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 1024 4096 -1 -1 -1 + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 24576 4096 1024 -1 -1 -1 + diff --git a/script/profile_reduce_no_index.sh b/script/profile_reduce_no_index.sh new file mode 100644 index 00000000..66bfe1dc --- /dev/null +++ b/script/profile_reduce_no_index.sh @@ -0,0 +1,78 @@ +#!/bin/bash +DRIVER="../build/bin/ckProfiler" +VERIFY="-v $1" +INIT=$2 +NREPEAT=$3 +PRECISION=$4 +##PRECISION=--half +##PRECISION=--double +##PRECISION=--int8 +##PRECISION=--bf16 + +if [ -n $PRECISION ] && [ "$PRECISION" = "--half" -o "$PRECISION" = "--bf16" ]; then + ACCTYPE="-C 1" +elif [ -n $PRECISION ] && [ "$PRECISION" = "--int8" ]; then + ACCTYPE="-C 2" +fi + +#### 0 - ADD, 5 - AVG, 7 - NORM2 
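+# illustrative expansion of one case below, assuming PRECISION=--half (so ACCTYPE="-C 1" and op=5):
+#   ../build/bin/ckProfiler reduce --half -D 64,4,280,82 -R 0,1,2,3 -O 5 -C 1 -v <verify> <init> <repeats>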
+Operations="0 5" + +#### 0 - ADD, 5 - AVG, for int8, no NORM2 supported +if [ -n $PRECISION ] && [ "$PRECISION" = "--int8" -o "$PRECISION" = "--half" ]; then + Operations=5 +fi + +## for generic validation +for op in $Operations; do + set -x + ####### datatype layout reduce dims op acctype verify init repeats + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,22960 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,22960 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 4,1469440 -R 0 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 4,1469440 -R 1 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + set +x +done + +#### 0 - ADD, 5 - AVG, 7 - NORM2 +Operations=5 + +## for performance evaluation (resnet50 NHWC => C) +for op in $Operations; do + set -x + ####### datatype layout reduce dims op acctype verify init repeats + $DRIVER reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,28,28,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce 
$PRECISION -D 128,28,28,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op $ACCTYPE $VERIFY $INIT $NREPEAT + set +x +done + diff --git a/script/profile_reduce_with_index.sh b/script/profile_reduce_with_index.sh new file mode 100644 index 00000000..43543f44 --- /dev/null +++ b/script/profile_reduce_with_index.sh @@ -0,0 +1,70 @@ +#!/bin/bash +DRIVER="../build/bin/ckProfiler" +VERIFY="-v $1" +INIT=$2 +NREPEAT=$3 +PRECISION=$4 +##PRECISION=--half +##PRECISION=--double +##PRECISION=--int8 +##PRECISION=--bf16 + +#### 2 - MIN, 3 - MAX, 4 - AMAX +Operations="2 4" + +## for generic validation +for op in $Operations; do + for use_idx in 0 1; do + set -x + ####### datatype layout reduce dims op use index verify init repeats + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 1,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,2,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 64,4,280,82 -R 0,1,3 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,22960 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,22960 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 4,1469440 -R 0 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 4,1469440 -R 1 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + set +x + done +done + +Operations=2 + +## for performance evaluation (resnet50 NHWC => C) +for op in $Operations; do + for use_idx in 0 1; do + set -x + ####### datatype layout reduce dims op use index verify init repeats + $DRIVER reduce $PRECISION -D 256,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 256,230,230,3 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,14,14,1024 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce 
$PRECISION -D 128,28,28,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,58,58,128 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,7,7,2048 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,14,14,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,30,30,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,56,56,256 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,16,16,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,28,28,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,7,7,512 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + $DRIVER reduce $PRECISION -D 128,56,56,64 -R 0,1,2 -O $op -I $use_idx $VERIFY $INIT $NREPEAT + set +x + done +done + diff --git a/script/profile_resnet50.sh b/script/profile_resnet50.sh new file mode 100644 index 00000000..b55cb2cc --- /dev/null +++ b/script/profile_resnet50.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" + +OP=$1 +DATATYPE=$2 +IN_LAYOUT=$3 +WEI_LAYOUT=$4 +OUT_LAYOUT=$5 +VERIFY=$6 +INIT=$7 +LOG=$8 +TIME=$9 + + N=${10} + +# Resnet50 +######## op____________________ datatype in_layout wei_layout out_layout verify init log time N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 3 7 7 224 224 2 2 1 1 3 3 3 3 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 64 64 3 3 56 56 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 64 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 256 1 1 56 56 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 128 3 3 56 56 2 2 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 128 3 3 28 28 1 1 1 1 
1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 128 128 3 3 28 28 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 128 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 512 1 1 28 28 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 256 3 3 28 28 2 2 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 256 256 3 3 14 14 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 1024 256 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 
512 1024 1 1 14 14 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 512 3 3 14 14 2 2 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 2048 1 1 7 7 1 1 1 1 0 0 0 0 + $DRIVER conv_fwd_bias_relu $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 512 512 3 3 7 7 1 1 1 1 1 1 1 1 + $DRIVER conv_fwd_bias_relu_add $DATATYPE $IN_LAYOUT $WEI_LAYOUT $OUT_LAYOUT $VERIFY $INIT $LOG $TIME $N 2048 512 1 1 7 7 1 1 1 1 0 0 0 0 diff --git a/script/profile_splitK_gemm.sh b/script/profile_splitK_gemm.sh new file mode 100644 index 00000000..d62f0e47 --- /dev/null +++ b/script/profile_splitK_gemm.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +## GPU visibility +export HIP_VISIBLE_DEVICES=0 +DRIVER="../build/bin/ckProfiler" +echo $DRIVER +OP=$1 +DATATYPE=$2 +LAYOUT=$3 +VERIFY=$4 +INIT=$5 +LOG=$6 +TIME=$7 +KBatch=$8 + + +# 120 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC KBatch_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 1024 1024 -1 -1 -1 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 960 2048 2048 -1 -1 -1 $KBatch + +# 104 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC KBatch_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 832 1024 1024 -1 -1 -1 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 832 2048 2048 -1 -1 -1 $KBatch + +# 110 CU +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC KBatch_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 1408 1024 -1 -1 -1 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1280 2816 2048 -1 -1 -1 $KBatch + +# testing different strides +######## op datatype layout verify init log time M___ N___ K___ StrideA StrideB StrideC KBatch_ + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1024 1024 1024 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2048 2048 2048 $KBatch + + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1056 1056 1056 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2080 2080 2080 $KBatch + + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 1024 1024 1024 1088 1088 1088 $KBatch + $DRIVER $OP $DATATYPE $LAYOUT $VERIFY $INIT $LOG $TIME 2048 2048 2048 2112 2112 2112 $KBatch diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh new file mode 100644 index 00000000..eae334ae --- /dev/null +++ b/script/run_full_performance_tests.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ +# you would also need to set up some environment variables in order to +# post your new test results to the database and compare them to the baseline +# please 
contact Illia.Silin@amd.com for more details
+#
+# run the script as "./run_full_performance_tests.sh <verification> <environment tag> <branch name> <node name>"
+# input arguments:
+# verification = 0 : do not verify result correctness on CPU
+#              = 1 : verify correctness on CPU (may take a long time)
+# environment tag : a string describing the specifics of your test environment
+# branch name : name of the branch in git repo (git status | grep -e 'On branch')
+# node name : $hostname
+
+#get the command line arguments:
+export verify=$1
+echo 'Verification: ' $verify
+export env_type=$2
+echo 'Environment type: ' $env_type
+export branch=$3
+echo 'Branch name: ' $branch
+export host_name=$4
+echo 'Host name: ' $host_name
+function print_log_header(){
+    rm -f $1;
+    echo 'On branch ' $3 &> $1;
+    echo 'Node name: ' $4 >> $1;
+    #get GPU_arch and number of compute units from rocminfo
+    echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1;
+    rocminfo | grep "Compute Unit:" >> $1;
+    hipcc --version | grep -e 'HIP version' >> $1;
+    echo 'Environment type: ' $2 >> $1;
+    /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1;
+}
+
+#run gemm tests
+export gemm_log="perf_gemm.log"
+print_log_header $gemm_log $env_type $branch $host_name
+./profile_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 0 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 1 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 1 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 1 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 2 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 2 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 2 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 0 3 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 2 3 $verify 1 0 1 2>&1 | tee -a $gemm_log
+./profile_gemm.sh gemm 3 3 $verify 1 0 1 2>&1 | tee -a $gemm_log
+
+#run batched_gemm tests
+export batched_gemm_log="perf_batched_gemm.log"
+print_log_header $batched_gemm_log $env_type $branch $host_name
+./profile_batched_gemm.sh batched_gemm 0 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 0 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 0 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 0 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 1 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 0 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 2 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log
+./profile_batched_gemm.sh batched_gemm 3 0
$verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 1 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 2 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log +./profile_batched_gemm.sh batched_gemm 3 3 $verify 1 0 1 2>&1 | tee -a $batched_gemm_log + +#run grouped_gemm tests +export grouped_gemm_log="perf_grouped_gemm.log" +print_log_header $grouped_gemm_log $env_type $branch $host_name +./profile_grouped_gemm.sh grouped_gemm 1 0 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 1 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 2 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log +./profile_grouped_gemm.sh grouped_gemm 1 3 $verify 1 0 1 2>&1 | tee -a $grouped_gemm_log + +#run GEMM+Bilinear tests +export gemm_bilinear_log="perf_gemm_bilinear.log" +print_log_header $gemm_bilinear_log $env_type $branch $host_name +./profile_gemm_bilinear.sh gemm_bilinear 1 0 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 1 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 2 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log +./profile_gemm_bilinear.sh gemm_bilinear 1 3 $verify 1 0 1 2>&1 | tee -a $gemm_bilinear_log + +#run conv_fwd tests +export conv_fwd_log="perf_conv_fwd.log" +print_log_header $conv_fwd_log $env_type $branch $host_name +./profile_conv_fwd.sh conv_fwd 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log +./profile_conv_fwd.sh conv_fwd 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_fwd_log + +#run conv_bwd_data tests +export conv_bwd_data_log="perf_conv_bwd_data.log" +print_log_header $conv_bwd_data_log $env_type $branch $host_name +./profile_conv_bwd_data.sh conv_bwd_data 0 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 1 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 2 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log +./profile_conv_bwd_data.sh conv_bwd_data 3 1 $verify 1 0 1 256 2>&1 | tee -a $conv_bwd_data_log + +#run resnet50 tests +export resnet256_log="perf_resnet50_N256.log" +print_log_header $resnet256_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 2>&1 | tee -a $resnet256_log +export resnet4_log="perf_resnet50_N4.log" +print_log_header $resnet4_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 2>&1 | tee -a $resnet4_log + +#run reduction tests +export reduction_log="perf_reduction.log" +print_log_header $reduction_log $env_type $branch $host_name +./profile_reduce_with_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log +./profile_reduce_no_index.sh $verify 2 10 --half 2>&1 | tee -a $reduction_log + +#run splitK_gemm tests, first correctness verification, then performance +export splitK_gemm_ver_log="perf_splitK_gemm_verify.log" +print_log_header $splitK_gemm_ver_log $env_type $branch $host_name +./profile_splitK_gemm.sh gemm_splitk 0 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 0 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 0 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 0 
3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 1 0 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 1 1 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 1 2 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +./profile_splitK_gemm.sh gemm_splitk 1 3 $verify 1 0 0 4 2>&1 | tee -a $splitK_gemm_ver_log +export splitK_gemm_log="perf_splitK_gemm.log" +print_log_header $splitK_gemm_log $env_type $branch $host_name +./profile_splitK_gemm.sh gemm_splitk 0 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 0 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 0 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 0 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 0 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 1 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 2 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log +./profile_splitK_gemm.sh gemm_splitk 1 3 0 1 0 1 4 2>&1 | tee -a $splitK_gemm_log + +#run ONNX gemm tests +export onnx_log="perf_onnx_gemm.log" +print_log_header $onnx_log $env_type $branch $host_name +./profile_onnx_gemm.sh gemm 0 0 $verify 1 0 1 2>&1 | tee -a $onnx_log +./profile_onnx_gemm.sh gemm 1 0 $verify 1 0 1 2>&1 | tee -a $onnx_log diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh new file mode 100644 index 00000000..4e3a6fc8 --- /dev/null +++ b/script/run_performance_tests.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# +# in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/ +# run the script as "./run_performance_tests.sh < node name> +# input arguments: +# verification = 0 : do not verify result correctness on CPU +# = 1 : verify correctness on CPU (may take a long time) +# environment tag : a string describing the specifics of your test environment +# branch name : name of the branch in git repo (git status | grep -e 'On branch') +# node name : $hostname + +#get the command line arguments: +export verify=$1 +echo 'Verification: ' $verify +export env_type=$2 +echo 'Environment type: ' $env_type +export branch=$3 +echo 'Branch name: ' $branch +export host_name=$4 +echo 'Host name: ' $host_name + +function print_log_header(){ + rm -f $1; + echo 'On branch ' $3 &> $1; + echo 'Node name: ' $4 >> $1; + #get GPU_arch and number of compute units from rocminfo + echo -n "GPU_arch: " >> $1; rocminfo | grep "Name:" | grep "gfx" >> $1; + rocminfo | grep "Compute Unit:" >> $1; + hipcc --version | grep -e 'HIP version' >> $1; + echo 'Environment type: ' $2 >> $1; + /opt/rocm/bin/amdclang++ --version | grep -e 'InstalledDir' >> $1; +} + +#run gemm tests +export gemm_log="perf_gemm.log" +print_log_header $gemm_log $env_type $branch $host_name +./profile_gemm.sh gemm 0 0 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 0 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 0 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 0 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 1 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 1 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 1 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 1 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 2 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 2 $verify 1 0 1 | tee -a 
$gemm_log +./profile_gemm.sh gemm 2 2 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 2 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 0 3 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 1 3 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 2 3 $verify 1 0 1 | tee -a $gemm_log +./profile_gemm.sh gemm 3 3 $verify 1 0 1 | tee -a $gemm_log + +#run resnet50 tests +export resnet256_log="perf_resnet50_N256.log" +print_log_header $resnet256_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 256 | tee -a $resnet256_log +export resnet4_log="perf_resnet50_N4.log" +print_log_header $resnet4_log $env_type $branch $host_name +./profile_resnet50.sh conv_fwd_bias_relu 1 1 1 1 $verify 1 0 1 4 | tee -a $resnet4_log diff --git a/script/test_convnd_fwd.sh b/script/test_convnd_fwd.sh new file mode 100644 index 00000000..1bd7a6b5 --- /dev/null +++ b/script/test_convnd_fwd.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash + +# set -e + +DIM1=False +DIM2=True +DIM3=False +DATE=220317 +GIT_HASH=4e6dfda +LOG_DIR=${DATE}_${GIT_HASH} +SUFFIX=${GIT_HASH} + + +#-------------------------------------------------------------------------- +# Commandline arguments parsing +# like: cmd -key[--key] value +#-------------------------------------------------------------------------- + +POSITIONAL=() +while [[ $# -gt 0 ]] +do +key="$1" + +case $key in + -d1|--d1) + DIM1=True + echo DIM1: "${DIM1}" + shift # past argument + ;; + -d2|--d2) + DIM2=True + echo DIM2: "${DIM2}" + shift # past argument + ;; + -d3|--d3) + DIM3=True + echo DIM3: "${DIM3}" + shift # past argument + ;; + -all|--all) + DIM1=True + DIM2=True + DIM3=True + echo DIM1: "${DIM1}" + echo DIM2: "${DIM2}" + echo DIM3: "${DIM3}" + shift # past argument + ;; + -s|--suffix) + SUFFIX=${SUFFIX}_"$2" + echo SUFFIX: "${SUFFIX}" + shift # past argument + shift # past value + ;; + *) # unknown option + POSITIONAL+=("$1") # save it in an array for later + shift # past argument + ;; +esac +done +set -- "${POSITIONAL[@]}" # restore positional parameters + +#-------------------------------------------------------------------------- + +# NUMACTL="numactl --cpunodebind=1 --membind=1" +NUMACTL= +# ENV_CONF= +GPU=mi100 +PROF_ITER_COUNT=10000 +LOG_DIR_PATH=../log/${LOG_DIR} +set -x + +#------------------------------------------------------------------------------- +# 1D +#------------------------------------------------------------------------------- + +if [[ "${DIM1}" == "True" ]]; then + mkdir -p ${LOG_DIR_PATH} + echo ">>>>>>>> RUN test conv1d nwc <<<<<<<<<<" + CMD="./../build/bin/test_conv1d_fwd" + ${NUMACTL} ${CMD} 2>&1 \ + | tee ${LOG_DIR_PATH}/test_conv1d_fwd_nwc_${SUFFIX}_${GPU}.log + +fi + +#------------------------------------------------------------------------------- +# 2D +#------------------------------------------------------------------------------- + +if [[ "${DIM2}" == "True" ]]; then + mkdir -p ${LOG_DIR_PATH} + echo ">>>>>>>> RUN test conv2d nhwc <<<<<<<<<<" + CMD="./../build/bin/test_conv2d_fwd" + ${NUMACTL} ${CMD} 2>&1 \ + | tee ${LOG_DIR_PATH}/test_conv2d_fwd_nhwc_${SUFFIX}_${GPU}.log + +fi + +#------------------------------------------------------------------------------- +# 3D +#------------------------------------------------------------------------------- + +if [[ "${DIM3}" == "True" ]]; then + mkdir -p ${LOG_DIR_PATH} + echo ">>>>>>>> RUN test conv3d ndhwc <<<<<<<<<<" + CMD="./../build/bin/test_conv3d_fwd" + ${NUMACTL} ${CMD} 2>&1 \ + | tee 
${LOG_DIR_PATH}/test_conv3d_fwd_ndhwc_${SUFFIX}_${GPU}.log + +fi diff --git a/script/test_reduce_no_index.sh b/script/test_reduce_no_index.sh new file mode 100644 index 00000000..b9563038 --- /dev/null +++ b/script/test_reduce_no_index.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +## The following will be used for CI + +set -x + +## for float +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 0 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 0 2 + +## for float64 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 6 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 6 2 + +## for float16 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 1 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 1 2 + +## for int8_t +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 3 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 3 2 + +## for bfloat16 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2,3 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,2 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,1,3 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0,2,3 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1,2,3 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 0 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 1 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 2 5 2 +bin/test_reduce_no_index -D 64,4,280,82 -R 3 5 2 + +set +x + diff --git a/script/test_reduce_with_index.sh b/script/test_reduce_with_index.sh new file mode 100644 index 00000000..b0843ba6 --- /dev/null +++ b/script/test_reduce_with_index.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +## The following will be used for CI + +set -x + +## for float +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 0 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 0 2 + +## for 
float64 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 6 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 6 2 + +## for float16 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 1 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 1 2 + +## for int8_t +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 3 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 3 2 + +## for bfloat16 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2,3 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,2 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,1,3 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0,2,3 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1,2,3 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 0 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 1 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 2 5 2 +bin/test_reduce_with_index -D 64,4,280,82 -R 3 5 2 + +set +x + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 00000000..b2e25e4c --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,60 @@ +include_directories(BEFORE + ${PROJECT_SOURCE_DIR}/ + ${PROJECT_SOURCE_DIR}/profiler/include +) + +include(googletest) + +add_custom_target(tests) + +function(add_test_executable TEST_NAME) + message("adding test ${TEST_NAME}") + add_executable(${TEST_NAME} ${ARGN}) + add_test(NAME ${TEST_NAME} COMMAND $) + add_dependencies(tests ${TEST_NAME}) + add_dependencies(check ${TEST_NAME}) + rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) +endfunction(add_test_executable TEST_NAME) + +include(GoogleTest) + +function(add_gtest_executable TEST_NAME) + message("adding gtest ${TEST_NAME}") + add_executable(${TEST_NAME} ${ARGN}) + add_dependencies(tests ${TEST_NAME}) + add_dependencies(check ${TEST_NAME}) + + # suppress gtest warnings + target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef) + target_link_libraries(${TEST_NAME} PRIVATE gtest_main) + add_test(NAME ${TEST_NAME} COMMAND $ ) + rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) +endfunction(add_gtest_executable TEST_NAME) + +add_subdirectory(magic_number_division) +add_subdirectory(space_filling_curve) +add_subdirectory(conv_util) +add_subdirectory(reference_conv_fwd) +add_subdirectory(gemm) +add_subdirectory(gemm_split_k) +add_subdirectory(gemm_reduce) +add_subdirectory(batched_gemm) +add_subdirectory(batched_gemm_reduce) +add_subdirectory(batched_gemm_gemm) 
+add_subdirectory(batched_gemm_softmax_gemm) +add_subdirectory(batched_gemm_softmax_gemm_permute) +add_subdirectory(grouped_gemm) +add_subdirectory(reduce) +add_subdirectory(convnd_fwd) +add_subdirectory(convnd_bwd_data) +add_subdirectory(grouped_convnd_fwd) +add_subdirectory(grouped_convnd_bwd_weight) +add_subdirectory(block_to_ctile_map) +add_subdirectory(softmax) +add_subdirectory(normalization) +add_subdirectory(data_type) +add_subdirectory(elementwise_normalization) +add_subdirectory(batchnorm) +if(GPU_TARGETS MATCHES "gfx1100") + add_subdirectory(wmma_op) +endif() diff --git a/test/batched_gemm/CMakeLists.txt b/test/batched_gemm/CMakeLists.txt new file mode 100644 index 00000000..0574f98e --- /dev/null +++ b/test/batched_gemm/CMakeLists.txt @@ -0,0 +1,15 @@ +add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp) +target_link_libraries(test_batched_gemm_fp16 PRIVATE utility) +target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance) + +add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp) +target_link_libraries(test_batched_gemm_fp32 PRIVATE utility) +target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance) + +add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp) +target_link_libraries(test_batched_gemm_bf16 PRIVATE utility) +target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance) + +add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp) +target_link_libraries(test_batched_gemm_int8 PRIVATE utility) +target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance) diff --git a/test/batched_gemm/batched_gemm_bf16.cpp b/test/batched_gemm/batched_gemm_bf16.cpp new file mode 100644 index 00000000..78be5406 --- /dev/null +++ b/test/batched_gemm/batched_gemm_bf16.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "profiler/profile_batched_gemm_impl.hpp" + +namespace { +using ADataType = ck::bhalf_t; +using BDataType = ck::bhalf_t; +using CDataType = ck::bhalf_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +} // namespace + +int main() +{ + int M = 256; + int N = 256; + int K = 128; + int BatchCount = 3; + + bool pass = true; + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); + + std::cout << "test BatchedGEMM bf16: " << (pass ? "Pass" : "Fail") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/batched_gemm/batched_gemm_fp16.cpp b/test/batched_gemm/batched_gemm_fp16.cpp new file mode 100644 index 00000000..6cbbedf6 --- /dev/null +++ b/test/batched_gemm/batched_gemm_fp16.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
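+// The four profile_batched_gemm_impl calls below exercise the four A/B layout
+// combinations (Row/Row, Row/Col, Col/Row, Col/Col) with a row-major C.
+// The leading dimensions follow the layout (K or M for A, N or K for B, N for C),
+// and the per-batch strides are the dense tensor sizes
+//   batch_stride_A = M * K, batch_stride_B = K * N, batch_stride_C = M * N,
+// i.e. consecutive batches are packed back to back.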
+ +#include + +#include "profiler/profile_batched_gemm_impl.hpp" + +namespace { +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +} // namespace + +int main() +{ + int M = 512; + int N = 256; + int K = 128; + int BatchCount = 3; + + bool pass = true; + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); + + std::cout << "test BatchedGEMM fp16: " << (pass ? "Pass" : "Fail") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/batched_gemm/batched_gemm_fp32.cpp b/test/batched_gemm/batched_gemm_fp32.cpp new file mode 100644 index 00000000..c9e565e2 --- /dev/null +++ b/test/batched_gemm/batched_gemm_fp32.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "profiler/profile_batched_gemm_impl.hpp" + +namespace { +using ADataType = float; +using BDataType = float; +using CDataType = float; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +} // namespace + +int main() +{ + int M = 256; + int N = 256; + int K = 128; + int BatchCount = 3; + + bool pass = true; + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); + + std::cout << "test BatchedGEMM fp32: " << (pass ? "Pass" : "Fail") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/batched_gemm/batched_gemm_int8.cpp b/test/batched_gemm/batched_gemm_int8.cpp new file mode 100644 index 00000000..4da941a5 --- /dev/null +++ b/test/batched_gemm/batched_gemm_int8.cpp @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
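+// Same layout sweep as the fp16/fp32/bf16 variants; only the element-type
+// aliases change. The problem size (M = N = 256, K = 128, batch 3) is kept
+// small, presumably so that host-side verification stays quick.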
+ +#include + +#include "profiler/profile_batched_gemm_impl.hpp" + +namespace { +using ADataType = int8_t; +using BDataType = int8_t; +using CDataType = int8_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; +} // namespace + +int main() +{ + int M = 256; + int N = 256; + int K = 128; + int BatchCount = 3; + + bool pass = true; + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, K, K, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, N, N, M * K, K * N, M * N, BatchCount); + + pass = pass && + ck::profiler::profile_batched_gemm_impl( + true, 1, false, 1, M, N, K, M, K, N, M * K, K * N, M * N, BatchCount); + + std::cout << "test BatchedGEMM int8: " << (pass ? "Pass" : "Fail") << std::endl; + return pass ? 0 : 1; +} diff --git a/test/batched_gemm_gemm/CMakeLists.txt b/test/batched_gemm_gemm/CMakeLists.txt new file mode 100644 index 00000000..38680971 --- /dev/null +++ b/test/batched_gemm_gemm/CMakeLists.txt @@ -0,0 +1,5 @@ +add_custom_target(test_batched_gemm_gemm) + +add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp) +target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance) +add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16) \ No newline at end of file diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp new file mode 100644 index 00000000..aa113de2 --- /dev/null +++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_fp16.cpp @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
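+// Typed tests over two kernel tuples (data types plus A/B0/B1/C layouts) that
+// drive profile_batched_gemm_gemm_impl. The Pad*/Odd* cases use sizes that are
+// not multiples of the tile dimensions to exercise the padding paths, and
+// DISABLED_Bench_FP16 is a benchmark-only sweep. A single case can be run with
+// the usual gtest filter, e.g. (assuming binaries land under build/bin):
+//   ./bin/test_batched_gemm_gemm_fp16 --gtest_filter='*Test_FP16_PadK*'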
+ +#include "gtest/gtest.h" +#include "test_batched_gemm_gemm_util.hpp" + +template +class TestBatchedGemmGemmFP16 : public TestBatchedGemmGemm +{ +}; + +// clang-format off +using KernelTypes = ::testing::Types< + std::tuple, + std::tuple + >; +// clang-format on + +TYPED_TEST_SUITE(TestBatchedGemmGemmFP16, KernelTypes); + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16) { this->Run(); } + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadM) +{ + this->lengths_ = std::vector>{ + {136, 128, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadN) +{ + this->lengths_ = std::vector>{ + {128, 136, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadK) +{ + this->lengths_ = std::vector>{ + {128, 128, 40, 128, 1}, + {128, 128, 136, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_PadO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 136, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddM) +{ + this->lengths_ = std::vector>{ + {129, 128, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddN) +{ + this->lengths_ = std::vector>{ + {128, 129, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddK) +{ + this->lengths_ = std::vector>{ + {128, 128, 33, 128, 1}, + {128, 128, 129, 128, 1}, + }; + this->Run(); +} + +// If kernel B1Layout is RowMajor, expect not to support odd O size +TYPED_TEST(TestBatchedGemmGemmFP16, Test_FP16_OddO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 129, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmGemmFP16, DISABLED_Bench_FP16) +{ + this->lengths_ = std::vector>{ + {256, 256, 64, 64, 768}, + {256, 256, 128, 128, 768}, + {512, 512, 64, 64, 768}, + {512, 512, 128, 128, 768}, + {1024, 1024, 64, 64, 768}, + {1024, 1024, 128, 128, 768}, + {2048, 2048, 64, 64, 768}, + {2048, 2048, 128, 128, 768}, + {4096, 4096, 64, 64, 768}, + {4096, 4096, 128, 128, 768}, + }; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +using ck::tensor_operation::device::GemmSpecialization; + +TEST(TestBatchedGemmGemmInterface, GemmSpecializationSizeMatch) +{ + int P = 120; // requires padding + int Q = 128; // do not require padding + + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, Q, 
P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, P, P)); + // clang-format on +} + +TEST(TestBatchedGemmGemmInterface, GemmSpecializationSizeMismatch) +{ + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 120, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 120)); + // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 129, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 130, 128)); + // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 129)); + // clang-format on +} diff --git a/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp new file mode 100644 index 00000000..53c4d37c --- /dev/null +++ b/test/batched_gemm_gemm/test_batched_gemm_gemm_util.hpp @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp" +#include "profiler/profile_batched_gemm_gemm_impl.hpp" + +using ck::tensor_operation::device::GemmSpecialization; + +template +using I = ck::Number; + +using F16 = ck::half_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +struct TestBatchedGemmGemm : public ::testing::Test +{ + using ADataType = std::tuple_element_t<0, Tuple>; + using B0DataType = std::tuple_element_t<1, Tuple>; + using B1DataType = std::tuple_element_t<2, Tuple>; + using CDataType = std::tuple_element_t<3, Tuple>; + using ALayout = std::tuple_element_t<4, Tuple>; + using B0Layout = std::tuple_element_t<5, Tuple>; + using B1Layout = std::tuple_element_t<6, Tuple>; + using CLayout = std::tuple_element_t<7, Tuple>; + + std::vector> lengths_ = { + {256, 256, 64, 64, 4}, + {256, 256, 128, 128, 4}, + {512, 512, 64, 64, 2}, + {512, 512, 128, 128, 2}, + {1024, 1024, 64, 64, 1}, + {1024, 1024, 128, 128, 1}, + }; + bool bench_ = false; + bool verify_ = true; + + void RunSingle(int M, int N, int K, int O, int BatchCount) + { + bool pass = ck::profiler::profile_batched_gemm_gemm_impl( + verify_, 1, false, bench_, M, N, K, O, BatchCount); + + EXPECT_TRUE(pass); + } + + void Run() + { + for(auto lengths : this->lengths_) + { + int M = lengths[0]; + int N = lengths[1]; + int K = lengths[2]; + int O = lengths[3]; + int BatchCount = lengths[4]; + + this->RunSingle(M, N, K, O, BatchCount); + } + } +}; + +template +struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128 +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + using ALayout = Row; + using B0Layout = Col; + using B1Layout = Row; + using CLayout = Row; + + using ADataType = F16; + using B0DataType = F16; + using B1DataType = F16; + using AccDataType = float; 
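+    // A, B0, B1 and C are fp16; accumulation (and the CShuffle staging type
+    // declared below) stays in fp32.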
+ using CShuffleDataType = float; + using CDataType = F16; + + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using Acc0ElementOp = PassThrough; + using B1ElementOp = PassThrough; + using CElementOp = PassThrough; + + template + using S = ck::Sequence; + + // static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value; + + using DeviceGemmGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8>; // CShuffleBlockTransferScalarPerVector_NPerBlock + + bool IsSupported(int M, int N, int K, int O) + { + auto gemm = DeviceGemmGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + M, + N, + K, + O, + 0, // BatchCount + 0, // StrideA + 0, // StrideB0 + 0, // StrideB1 + 0, // StrideC + 0, // BatchStrideA + 0, // BatchStrideB0 + 0, // BatchStrideB1 + 0, // BatchStrideC + PassThrough{}, // a_element_op + PassThrough{}, // b0_element_op + PassThrough{}, // acc0_element_op + PassThrough{}, // b1_element_op + PassThrough{}); // c_element_op + + return gemm.IsSupportedArgument(argument); + } +}; diff --git a/test/batched_gemm_reduce/CMakeLists.txt b/test/batched_gemm_reduce/CMakeLists.txt new file mode 100644 index 00000000..4dc0b082 --- /dev/null +++ b/test/batched_gemm_reduce/CMakeLists.txt @@ -0,0 +1,3 @@ +add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp) +target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility) +target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance) diff --git a/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp new file mode 100644 index 00000000..b150ce50 --- /dev/null +++ b/test/batched_gemm_reduce/batched_gemm_reduce_fp16.cpp @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
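+// Batched GEMM + reduction smoke test: the same four A/B layout combinations
+// as the plain batched GEMM tests, driven through
+// profile_batched_gemm_reduce_impl, returning a non-zero exit code on failure
+// so the CTest runner reports it.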
+ +#include + +#include "profiler/profile_batched_gemm_reduce_impl.hpp" + +int main() +{ + using Row = ck::tensor_layout::gemm::RowMajor; + using Col = ck::tensor_layout::gemm::ColumnMajor; + + int M = 512; + int N = 256; + int K = 128; + + int BatchCount = 3; + + bool pass = true; + + pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( + true, 1, false, false, M, N, K, K, N, N, BatchCount); + + pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( + true, 1, false, false, M, N, K, K, K, N, BatchCount); + + pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( + true, 1, false, false, M, N, K, M, N, N, BatchCount); + + pass = pass && ck::profiler::profile_batched_gemm_reduce_impl( + true, 1, false, false, M, N, K, M, K, N, BatchCount); + + if(pass) + { + std::cout << "test BatchedGEMM+Reduce fp16: Pass" << std::endl; + return 0; + } + else + { + std::cout << "test BatchedGEMM+Reduce fp16: Fail" << std::endl; + return -1; + } +} diff --git a/test/batched_gemm_softmax_gemm/CMakeLists.txt b/test/batched_gemm_softmax_gemm/CMakeLists.txt new file mode 100644 index 00000000..1ceecefb --- /dev/null +++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt @@ -0,0 +1,5 @@ +add_custom_target(test_batched_gemm_softmax_gemm) + +add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp) +target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance) +add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16) \ No newline at end of file diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp new file mode 100644 index 00000000..5df7769d --- /dev/null +++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_fp16.cpp @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
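+// Fused batched GEMM -> softmax -> GEMM tests. The structure mirrors
+// test_batched_gemm_gemm_fp16.cpp, but the kernel tuple additionally carries a
+// masking flag (NoMask / Masked) and there is an extra DISABLED_ benchmark
+// sweep over irregular K sizes.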
+ +#include "gtest/gtest.h" +#include "test_batched_gemm_softmax_gemm_util.hpp" + +template +class TestBatchedGemmSoftmaxGemmFP16 : public TestBatchedGemmSoftmaxGemm +{ +}; + +using Masked = std::true_type; +using NoMask = std::false_type; + +// clang-format off +using KernelTypes = ::testing::Types< + std::tuple, + std::tuple + >; +// clang-format on + +TYPED_TEST_SUITE(TestBatchedGemmSoftmaxGemmFP16, KernelTypes); + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16) { this->Run(); } + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_PadM) +{ + this->lengths_ = std::vector>{ + {136, 128, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_PadN) +{ + this->lengths_ = std::vector>{ + {128, 136, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_PadK) +{ + this->lengths_ = std::vector>{ + {128, 128, 40, 128, 1}, + {128, 128, 136, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_PadO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 136, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_OddM) +{ + this->lengths_ = std::vector>{ + {129, 128, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_OddN) +{ + this->lengths_ = std::vector>{ + {128, 129, 32, 128, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_OddK) +{ + this->lengths_ = std::vector>{ + {128, 128, 33, 128, 1}, + {128, 128, 129, 128, 1}, + }; + this->Run(); +} + +// If kernel B1Layout is RowMajor, expect not to support odd O size +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, Test_FP16_OddO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 129, 1}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, DISABLED_Bench_FP16) +{ + this->lengths_ = std::vector>{ + {256, 256, 64, 64, 768}, + {256, 256, 128, 128, 768}, + {512, 512, 64, 64, 768}, + {512, 512, 128, 128, 768}, + {1024, 1024, 64, 64, 768}, + {1024, 1024, 128, 128, 768}, + {2048, 2048, 64, 64, 768}, + {2048, 2048, 128, 128, 768}, + {4096, 4096, 64, 64, 768}, + {4096, 4096, 128, 128, 768}, + }; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, DISABLED_Bench_FP16_IrregularK) +{ + this->lengths_ = std::vector>{{256, 256, 160, 160, 16}, + {256, 64, 160, 64, 16}, + {1024, 1024, 80, 80, 16}, + {1024, 64, 80, 64, 16}, + {4096, 4096, 40, 40, 16}, + {4096, 64, 40, 64, 16}}; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +using ck::tensor_operation::device::GemmSpecialization; + +TEST(TestBatchedGemmSoftmaxGemmInterface, GemmSpecializationSizeMatch) +{ + int P = 120; // requires padding + int Q = 128; // do not require padding + + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, P, Q)); + 
EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, P, P)); + // clang-format on +} + +TEST(TestBatchedGemmSoftmaxGemmInterface, GemmSpecializationSizeMismatch) +{ + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 120, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 120)); + // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 129, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 130, 128)); + // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 129)); + // clang-format on +} + +TYPED_TEST(TestBatchedGemmSoftmaxGemmFP16, AdhocTest) +{ + this->lengths_ = std::vector>{ + {49, 49, 64, 64, 24}, + {64, 49, 64, 64, 24}, + {1020, 1020, 64, 128, 24}, + {576, 576, 64, 64, 24}, + }; + this->Run(); +} diff --git a/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp new file mode 100644 index 00000000..98debe19 --- /dev/null +++ b/test/batched_gemm_softmax_gemm/test_batched_gemm_softmax_gemm_util.hpp @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
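+// Test fixture and device-instance wrapper shared by the fused
+// GEMM+softmax+GEMM tests. The fixture unpacks data types, layouts and the
+// masking flag from the type tuple and sweeps lengths_ as {M, N, K, O, Batch};
+// the wrapper instantiates a single 128x128x32 (O = 128) tile configuration
+// and calls IsSupportedArgument() with null device pointers, so it only
+// exercises the shape-support logic and allocates no memory.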
+ +#include + +#include +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp" +#include "profiler/profile_batched_gemm_softmax_gemm_impl.hpp" +using ck::tensor_operation::device::GemmSpecialization; + +template +using I = ck::Number; + +using F16 = ck::half_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +struct TestBatchedGemmSoftmaxGemm : public ::testing::Test +{ + using ADataType = std::tuple_element_t<0, Tuple>; + using B0DataType = std::tuple_element_t<1, Tuple>; + using B1DataType = std::tuple_element_t<2, Tuple>; + using CDataType = std::tuple_element_t<3, Tuple>; + using ALayout = std::tuple_element_t<4, Tuple>; + using B0Layout = std::tuple_element_t<5, Tuple>; + using B1Layout = std::tuple_element_t<6, Tuple>; + using CLayout = std::tuple_element_t<7, Tuple>; + using MaskingType = std::tuple_element_t<8, Tuple>; + + std::vector> lengths_ = {{256, 256, 64, 64, 4}, + {256, 256, 128, 128, 4}, + {512, 512, 64, 64, 2}, + {512, 512, 128, 128, 2}, + {1024, 1024, 64, 64, 1}, + {1024, 1024, 128, 128, 1}, + {256, 256, 160, 160, 4}, + {256, 64, 160, 64, 4}, + {1024, 1024, 80, 80, 2}, + {1024, 64, 80, 64, 2}, + {4096, 4096, 40, 40, 1}, + {4096, 64, 40, 64, 1}}; + + bool bench_ = false; + bool verify_ = true; + + void RunSingle(int M, int N, int K, int O, int BatchCount) + { + bool pass = ck::profiler::profile_batched_gemm_softmax_gemm_impl( + verify_, 1, false, bench_, M, N, K, O, BatchCount); + + EXPECT_TRUE(pass); + } + + void Run() + { + for(auto lengths : this->lengths_) + { + int M = lengths[0]; + int N = lengths[1]; + int K = lengths[2]; + int O = lengths[3]; + int BatchCount = lengths[4]; + + this->RunSingle(M, N, K, O, BatchCount); + } + } +}; + +template +struct DeviceInstanceWrapper_TNTT_FP16_M128_N128_K32_O128 +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + + using ALayout = Row; + using B0Layout = Col; + using B1Layout = Row; + using CLayout = Row; + + using ADataType = F16; + using B0DataType = F16; + using B1DataType = F16; + using AccDataType = float; + using CShuffleDataType = float; + using CDataType = F16; + + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using Acc0ElementOp = PassThrough; + using B1ElementOp = PassThrough; + using CElementOp = PassThrough; + + template + using S = ck::Sequence; + + // static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value; + + using DeviceGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle< + ALayout, + B0Layout, + B1Layout, + CLayout, + ADataType, + B0DataType, + B1DataType, + CDataType, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // 
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + false>; + + bool IsSupported(int M, int N, int K, int O) + { + auto gemm = DeviceGemmGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + M, + N, + K, + O, + 0, // BatchCount + 0, // StrideA + 0, // StrideB0 + 0, // StrideB1 + 0, // StrideC + 0, // BatchStrideA + 0, // BatchStrideB0 + 0, // BatchStrideB1 + 0, // BatchStrideC + PassThrough{}, // a_element_op + PassThrough{}, // b0_element_op + PassThrough{}, // acc0_element_op + PassThrough{}, // b1_element_op + PassThrough{}); // c_element_op + + return gemm.IsSupportedArgument(argument); + } +}; diff --git a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt new file mode 100644 index 00000000..f858d9f2 --- /dev/null +++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt @@ -0,0 +1,8 @@ +add_custom_target(test_batched_gemm_softmax_gemm_permute) + +add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) +add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp) +target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) +target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) +add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) +add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16) \ No newline at end of file diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp new file mode 100644 index 00000000..4a775e6d --- /dev/null +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_bf16.cpp @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
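+// bf16 variant of the batched GEMM + masking/scale + softmax + GEMM + permute
+// tests. Problem sizes carry two batch-like dimensions, {M, N, K, O, G0, G1},
+// masking is selected per kernel tuple (MaskDisabled / MaskOutUpperTriangle),
+// and the regular-size sweep is marked DISABLED_ here while the padded,
+// odd-size and ad-hoc cases still run.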
+ +#include "gtest/gtest.h" +#include "test_batched_gemm_softmax_gemm_permute_util.hpp" + +template +class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16 + : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute +{ +}; + +using I1_t = ck::Number<1>; +using I2_t = ck::Number<2>; + +using MaskDisabled_t = + ck::integral_constant; +using MaskOutUpperTriangle_t = + ck::integral_constant; + +// clang-format off +using KernelTypes = ::testing::Types< + std::tuple, ck::Tuple<>, MaskDisabled_t>, + std::tuple, ck::Tuple<>, MaskOutUpperTriangle_t> + >; +// clang-format on + +TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, KernelTypes); + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Test_BF16) { this->Run(); } + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadM) +{ + this->lengths_ = std::vector>{ + {136, 128, 32, 128, 2, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadN) +{ + this->lengths_ = std::vector>{ + {128, 136, 32, 128, 3, 2}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadK) +{ + this->lengths_ = std::vector>{ + {128, 128, 40, 128, 2, 4}, + {128, 128, 136, 128, 4, 2}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_PadO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 136, 1, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddM) +{ + this->lengths_ = std::vector>{ + {129, 128, 32, 128, 2, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddN) +{ + this->lengths_ = std::vector>{ + {128, 129, 32, 128, 4, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddK) +{ + this->lengths_ = std::vector>{ + {128, 128, 33, 128, 2, 3}, + {128, 128, 129, 128, 2, 3}, + }; + this->Run(); +} + +// If kernel B1Layout is RowMajor, expect not to support odd O size +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, Test_BF16_OddO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 129, 2, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16_IrregularK) +{ + this->lengths_ = std::vector>{{256, 256, 160, 160, 1, 16}, + {256, 64, 160, 64, 1, 16}, + {1024, 1024, 80, 80, 1, 16}, + {1024, 64, 80, 64, 1, 16}, + {4096, 4096, 40, 40, 1, 16}, + {4096, 64, 40, 64, 1, 16}}; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, DISABLED_Bench_BF16) +{ + this->lengths_ = std::vector>{ + {256, 256, 64, 64, 48, 16}, + {256, 256, 128, 128, 48, 16}, + {512, 512, 64, 64, 48, 16}, + {512, 512, 128, 128, 48, 16}, + {1024, 1024, 64, 64, 48, 16}, + {1024, 1024, 128, 128, 48, 16}, + {2048, 2048, 64, 64, 48, 16}, + {2048, 2048, 128, 128, 48, 16}, + {4096, 4096, 64, 64, 48, 16}, + {4096, 4096, 128, 128, 48, 16}, + }; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +using ck::tensor_operation::device::GemmSpecialization; + +TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch) +{ + int P = 120; // requires padding + int Q = 128; // do not require padding + + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, Q)); + 
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(Q, P, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(P, P, P, P)); + // clang-format on +} + +TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch) +{ + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 120, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 120)); + // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 129, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 130, 128)); + // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 129)); + // clang-format on +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteBF16, AdhocTest) +{ + this->lengths_ = std::vector>{ + {49, 49, 64, 64, 4, 6}, + {64, 49, 64, 64, 4, 6}, + {1020, 1020, 64, 128, 4, 6}, + {576, 576, 64, 64, 4, 6}, + }; + this->Run(); +} diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp new file mode 100644 index 00000000..293acd60 --- /dev/null +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_fp16.cpp @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
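+// fp16 counterpart of the permute tests above, covering the same padded, odd
+// and ad-hoc shapes; unlike the bf16 file, the regular-size sweep (Test_FP16)
+// is enabled here.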
+ +#include "gtest/gtest.h" +#include "test_batched_gemm_softmax_gemm_permute_util.hpp" + +template +class TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16 + : public TestBatchedGemmMaskingScaleSoftmaxGemmPermute +{ +}; + +using I1_t = ck::Number<1>; +using I2_t = ck::Number<2>; + +using MaskDisabled_t = + ck::integral_constant; +using MaskOutUpperTriangle_t = + ck::integral_constant; + +// clang-format off +using KernelTypes = ::testing::Types< + std::tuple, ck::Tuple<>, MaskDisabled_t>, + std::tuple, ck::Tuple<>, MaskOutUpperTriangle_t> + >; +// clang-format on + +TYPED_TEST_SUITE(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, KernelTypes); + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16) { this->Run(); } + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadM) +{ + this->lengths_ = std::vector>{ + {136, 128, 32, 128, 2, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadN) +{ + this->lengths_ = std::vector>{ + {128, 136, 32, 128, 3, 2}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadK) +{ + this->lengths_ = std::vector>{ + {128, 128, 40, 128, 2, 4}, + {128, 128, 136, 128, 4, 2}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_PadO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 136, 1, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddM) +{ + this->lengths_ = std::vector>{ + {129, 128, 32, 128, 2, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddN) +{ + this->lengths_ = std::vector>{ + {128, 129, 32, 128, 4, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddK) +{ + this->lengths_ = std::vector>{ + {128, 128, 33, 128, 2, 3}, + {128, 128, 129, 128, 2, 3}, + }; + this->Run(); +} + +// If kernel B1Layout is RowMajor, expect not to support odd O size +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, Test_FP16_OddO) +{ + this->lengths_ = std::vector>{ + {128, 128, 32, 129, 2, 3}, + }; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, DISABLED_Bench_FP16_IrregularK) +{ + this->lengths_ = std::vector>{{256, 256, 160, 160, 1, 16}, + {256, 64, 160, 64, 1, 16}, + {1024, 1024, 80, 80, 1, 16}, + {1024, 64, 80, 64, 1, 16}, + {4096, 4096, 40, 40, 1, 16}, + {4096, 64, 40, 64, 1, 16}}; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, DISABLED_Bench_FP16) +{ + this->lengths_ = std::vector>{ + {256, 256, 64, 64, 48, 16}, + {256, 256, 128, 128, 48, 16}, + {512, 512, 64, 64, 48, 16}, + {512, 512, 128, 128, 48, 16}, + {1024, 1024, 64, 64, 48, 16}, + {1024, 1024, 128, 128, 48, 16}, + {2048, 2048, 64, 64, 48, 16}, + {2048, 2048, 128, 128, 48, 16}, + {4096, 4096, 64, 64, 48, 16}, + {4096, 4096, 128, 128, 48, 16}, + }; + this->bench_ = true; + this->verify_ = false; + this->Run(); +} + +using ck::tensor_operation::device::GemmSpecialization; + +TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMatch) +{ + int P = 120; // requires padding + int Q = 128; // do not require padding + + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, Q)); + 
EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, Q, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, P, Q)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, Q, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, Q, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(Q, P, P, P)); + EXPECT_TRUE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(P, P, P, P)); + // clang-format on +} + +TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteInterface, GemmSpecializationSizeMismatch) +{ + // IsSupported(M, N, K, O) + // clang-format off + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 120, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 120)); + // Kernel can't support odd K size because SrcVectorDim == KDim and must satisfy SizeKRaw % ABSrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 129, 128)); + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 130, 128)); + // Kernel can't support odd O size because SrcVectorDim == ODim and must satisfy SizeORaw % B1SrcScalarPerVector == 0 + EXPECT_FALSE(DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128{}.IsSupported(128, 128, 128, 129)); + // clang-format on +} + +TYPED_TEST(TestBatchedGemmMaskingScaleSoftmaxGemmPermuteFP16, AdhocTest) +{ + this->lengths_ = std::vector>{ + {49, 49, 64, 64, 4, 6}, + {64, 49, 64, 64, 4, 6}, + {1020, 1020, 64, 128, 4, 6}, + {576, 576, 64, 64, 4, 6}, + }; + this->Run(); +} diff --git a/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp new file mode 100644 index 00000000..912bbc91 --- /dev/null +++ b/test/batched_gemm_softmax_gemm_permute/test_batched_gemm_softmax_gemm_permute_util.hpp @@ -0,0 +1,368 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include + +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp" +#include "profiler/profile_batched_gemm_softmax_gemm_permute_impl.hpp" + +using ck::tensor_operation::device::GemmSpecialization; +using ck::tensor_operation::device::MaskingSpecialization; +using ck::tensor_operation::device::TensorSpecialization; + +template +using I = ck::Number; + +using F16 = ck::half_t; +using BF16 = ck::bhalf_t; + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +template +struct TestBatchedGemmMaskingScaleSoftmaxGemmPermute : public ::testing::Test +{ + using NumDimGType = std::tuple_element_t<0, Tuple>; + using NumDimMType = std::tuple_element_t<1, Tuple>; + using NumDimNType = std::tuple_element_t<2, Tuple>; + using NumDimKType = std::tuple_element_t<3, Tuple>; + using NumDimOType = std::tuple_element_t<4, Tuple>; + using ADataType = std::tuple_element_t<5, Tuple>; + using B0DataType = std::tuple_element_t<6, Tuple>; + using B1DataType = std::tuple_element_t<7, Tuple>; + using CDataType = std::tuple_element_t<8, Tuple>; + using Acc0BiasDataType = std::tuple_element_t<9, Tuple>; + using Acc1BiasDataType = std::tuple_element_t<10, Tuple>; + using MaskingType = std::tuple_element_t<11, Tuple>; + + std::vector> lengths_ = { + {256, 256, 64, 64, 6, 4}, + {256, 256, 128, 128, 4, 6}, + {512, 512, 64, 64, 3, 2}, + {512, 512, 128, 128, 2, 3}, + {1024, 1024, 64, 64, 3, 1}, + {1024, 1024, 128, 128, 1, 1}, + }; + bool bench_ = false; + bool verify_ = true; + + void RunSingle(int M, int N, int K, int O, int G0, int G1) + { + bool pass = + ck::profiler::profile_batched_gemm_softmax_gemm_permute_impl, + ck::Tuple<>, + MaskingType::value>( + verify_, 2, false, bench_, M, N, K, O, G0, G1); + + EXPECT_TRUE(pass); + } + + void Run() + { + for(auto lengths : this->lengths_) + { + int M = lengths[0]; + int N = lengths[1]; + int K = lengths[2]; + int O = lengths[3]; + int G0 = lengths[4]; + int G1 = lengths[5]; + + this->RunSingle(M, N, K, O, G0, G1); + } + } +}; + +template +struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_FP16_M128_N128_K32_O128 +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Scale = ck::tensor_operation::element_wise::Scale; + + template + using S = ck::Sequence; + + using ADataType = F16; + using B0DataType = F16; + using B1DataType = F16; + using AccDataType = float; + using CShuffleDataType = F16; + using CDataType = F16; + + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using Acc0ElementOp = Scale; + using B1ElementOp = PassThrough; + using CElementOp = PassThrough; + + // static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value; + + using DeviceGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + 2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecialization::Default, // ATensorSpec + TensorSpecialization::Default, // B0TensorSpec + TensorSpecialization::Default, // B1TensorSpec + TensorSpecialization::Default, // CTensorSpec + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // 
MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpecialization::MaskOutUpperTriangle>; // MaskOutUpperTriangle + + bool IsSupported(int M, int N, int K, int O) + { + const int G0 = 1, G1 = 1; + + // A layout [G0, M, G1, K] + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1}; + + // B0 layout [G0, N, G1, K] + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1}; + + // B1 layout [G0, N, G1, O] + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O}; + + // C layout [G0, M, G1, O] + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; + + auto gemm = DeviceGemmGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + {}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + PassThrough{}, // a_element_op + PassThrough{}, // b0_element_op + Scale{1.f}, // acc0_element_op + PassThrough{}, // b1_element_op + PassThrough{}); // c_element_op + + return gemm.IsSupportedArgument(argument); + } +}; + +template +struct DeviceInstanceWrapper_G2M1N1K1O1_TNTT_BF16_M128_N128_K32_O128 +{ + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using Scale = ck::tensor_operation::element_wise::Scale; + + template + using S = ck::Sequence; + + using ADataType = BF16; + using B0DataType = BF16; + using B1DataType = BF16; + using AccDataType = float; + using CShuffleDataType = BF16; + using CDataType = BF16; + + using AElementOp = PassThrough; + using B0ElementOp = PassThrough; + using Acc0ElementOp = Scale; + using B1ElementOp = PassThrough; + using CElementOp = PassThrough; + + // static constexpr auto GemmSpec = std::tuple_element_t<0, Tuple>::value; + + using DeviceGemmGemmInstance = + ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle< + 2, + 1, + 1, + 1, + 1, + ADataType, + B0DataType, + B1DataType, + CDataType, + ck::Tuple<>, + ck::Tuple<>, + AccDataType, + CShuffleDataType, + AElementOp, + B0ElementOp, + Acc0ElementOp, + B1ElementOp, + CElementOp, + GemmSpec, + TensorSpecialization::Default, // ATensorSpec + TensorSpecialization::Default, // B0TensorSpec + TensorSpecialization::Default, // B1TensorSpec + TensorSpecialization::Default, // CTensorSpec + 1, + 256, + 128, // MPerBlock + 128, // NPerBlock + 32, // KPerBlock + 128, // Gemm1NPerBlock + 32, // Gemm1KPerBlock + 8, // AK1 + 8, // BK1 + 2, // B1K1 + 32, // MPerXDL + 32, // NPerXDL + 1, // MXdlPerWave + 4, // NXdlPerWave + 4, // 
Gemm1NXdlPerWave + S<4, 64, 1>, // ABlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<4, 64, 1>, // BBlockTransfer + S<1, 0, 2>, + S<1, 0, 2>, + 2, + 8, + 8, + true, + S<8, 32, 1>, // B1BlockTransfer + S<0, 2, 1>, + S<0, 2, 1>, + 1, + 4, + 2, + false, + 1, // CShuffleMXdlPerWavePerShuffle + 2, // CShuffleNXdlPerWavePerShuffle + S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock + 8, // CShuffleBlockTransferScalarPerVector_NPerBlock + MaskingSpecialization::MaskOutUpperTriangle>; // MaskOutUpperTriangle + + bool IsSupported(int M, int N, int K, int O) + { + const int G0 = 1, G1 = 1; + + // A layout [G0, M, G1, K] + std::vector a_gs_ms_ks_lengths{G0, G1, M, K}; + std::vector a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1}; + + // B0 layout [G0, N, G1, K] + std::vector b0_gs_ns_ks_lengths{G0, G1, N, K}; + std::vector b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1}; + + // B1 layout [G0, N, G1, O] + std::vector b1_gs_os_ns_lengths{G0, G1, O, N}; + std::vector b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O}; + + // C layout [G0, M, G1, O] + std::vector c_gs_ms_os_lengths{G0, G1, M, O}; + std::vector c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1}; + + auto gemm = DeviceGemmGemmInstance{}; + auto invoker = gemm.MakeInvoker(); + auto argument = gemm.MakeArgument(static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + static_cast(nullptr), + {}, // p_acc0_biases + {}, // p_acc1_biases + a_gs_ms_ks_lengths, + a_gs_ms_ks_strides, + b0_gs_ns_ks_lengths, + b0_gs_ns_ks_strides, + b1_gs_os_ns_lengths, + b1_gs_os_ns_strides, + c_gs_ms_os_lengths, + c_gs_ms_os_strides, + {}, // acc0_biases_gs_ms_ns_lengths + {}, // acc0_biases_gs_ms_ns_strides + {}, // acc1_biases_gs_ms_os_lengths + {}, // acc1_biases_gs_ms_os_strides + PassThrough{}, // a_element_op + PassThrough{}, // b0_element_op + Scale{1.f}, // acc0_element_op + PassThrough{}, // b1_element_op + PassThrough{}); // c_element_op + + return gemm.IsSupportedArgument(argument); + } +}; diff --git a/test/batchnorm/CMakeLists.txt b/test/batchnorm/CMakeLists.txt new file mode 100644 index 00000000..52f15086 --- /dev/null +++ b/test/batchnorm/CMakeLists.txt @@ -0,0 +1,4 @@ +add_gtest_executable(test_batchnorm_fwd_rank_4 batchnorm_fwd_rank_4.cpp) +add_gtest_executable(test_batchnorm_bwd_rank_4 batchnorm_bwd_rank_4.cpp) +target_link_libraries(test_batchnorm_fwd_rank_4 PRIVATE utility device_batchnorm_instance) +target_link_libraries(test_batchnorm_bwd_rank_4 PRIVATE utility device_batchnorm_instance) diff --git a/test/batchnorm/batchnorm_bwd_rank_4.cpp b/test/batchnorm/batchnorm_bwd_rank_4.cpp new file mode 100644 index 00000000..caa7331e --- /dev/null +++ b/test/batchnorm/batchnorm_bwd_rank_4.cpp @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
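+// Note: for the rank-4 tests below, reduceDims = {0, 1, 2} corresponds to an NHWC
+// layout (reduce over N, H, W, keeping per-channel statistics), while
+// reduceDims = {0, 2, 3} corresponds to NCHW.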
+ +#include +#include +#include +#include +#include +#include + +#include "profiler/profile_batchnorm_backward_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using F64 = double; + +template +class TestBatchNormBwdRank4 : public ::testing::Test +{ + private: + const double epsilon = std::numeric_limits::epsilon(); + + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using DxDataType = std::tuple_element_t<1, Tuple>; + using DyDataType = std::tuple_element_t<2, Tuple>; + using AccDataType = std::tuple_element_t<3, Tuple>; + using ScaleDataType = std::tuple_element_t<4, Tuple>; + using BiasDataType = std::tuple_element_t<5, Tuple>; + using MeanVarDataType = std::tuple_element_t<6, Tuple>; + + std::vector> list_of_lengths = { + {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}}; + std::vector reduceDims; + + template + void Run() + { + for(auto& inOutLengths : list_of_lengths) + { + bool pass = true; + + EXPECT_FALSE(reduceDims.size() != NumReduceDim); + + pass = pass && ck::profiler::profile_batchnorm_backward_impl( + true, 3, false, false, inOutLengths, reduceDims, true, epsilon); + + pass = pass && ck::profiler::profile_batchnorm_backward_impl( + true, 3, false, false, inOutLengths, reduceDims, false, epsilon); + + EXPECT_TRUE(pass); + } + } +}; + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +TYPED_TEST_SUITE(TestBatchNormBwdRank4, KernelTypes); + +// nhwc +TYPED_TEST(TestBatchNormBwdRank4, nhwc) +{ + this->reduceDims = {0, 1, 2}; + this->template Run<3>(); +} + +// nchw +TYPED_TEST(TestBatchNormBwdRank4, nchw) +{ + this->reduceDims = {0, 2, 3}; + this->template Run<3>(); +} diff --git a/test/batchnorm/batchnorm_fwd_rank_4.cpp b/test/batchnorm/batchnorm_fwd_rank_4.cpp new file mode 100644 index 00000000..13aef7d6 --- /dev/null +++ b/test/batchnorm/batchnorm_fwd_rank_4.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "profiler/profile_batchnorm_forward_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using BF16 = ck::bhalf_t; +using I8 = int8_t; +using F64 = double; + +template +class TestBatchNormFwdRank4 : public ::testing::Test +{ + private: + const double epsilon = std::numeric_limits::epsilon(); + const double averageFactor = 0.1; + + protected: + using XDataType = std::tuple_element_t<0, Tuple>; + using YDataType = std::tuple_element_t<1, Tuple>; + using AccDataType = std::tuple_element_t<2, Tuple>; + using ScaleDataType = std::tuple_element_t<3, Tuple>; + using BiasDataType = std::tuple_element_t<4, Tuple>; + using MeanVarDataType = std::tuple_element_t<5, Tuple>; + + std::vector> list_of_lengths = { + {128, 16, 3, 1024}, {128, 16, 6, 512}, {1, 1, 1, 1}, {4, 4, 4, 4}, {32, 32, 32, 32}}; + std::vector reduceDims; + + template + void Run() + { + for(auto& inOutLengths : list_of_lengths) + { + bool pass = true; + + EXPECT_FALSE(reduceDims.size() != NumReduceDim); + + pass = + pass && ck::profiler::profile_batchnorm_forward_impl(true, + 3, + false, + false, + inOutLengths, + reduceDims, + true, + true, + epsilon, + averageFactor); + + pass = + pass && ck::profiler::profile_batchnorm_forward_impl(true, + 3, + false, + false, + inOutLengths, + reduceDims, + false, + false, + epsilon, + averageFactor); + + EXPECT_TRUE(pass); + } + } +}; + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; + +TYPED_TEST_SUITE(TestBatchNormFwdRank4, KernelTypes); + +// nhwc +TYPED_TEST(TestBatchNormFwdRank4, nhwc) +{ + this->reduceDims = {0, 1, 2}; + this->template Run<3>(); +} + +// nchw +TYPED_TEST(TestBatchNormFwdRank4, nchw) +{ + this->reduceDims = {0, 2, 3}; + this->template Run<3>(); +} diff --git a/test/block_to_ctile_map/CMakeLists.txt b/test/block_to_ctile_map/CMakeLists.txt new file mode 100644 index 00000000..97dfbb2b --- /dev/null +++ b/test/block_to_ctile_map/CMakeLists.txt @@ -0,0 +1 @@ +add_gtest_executable(test_block_to_ctile_map test_block_to_ctile_map.cpp) \ No newline at end of file diff --git a/test/block_to_ctile_map/test_block_to_ctile_map.cpp b/test/block_to_ctile_map/test_block_to_ctile_map.cpp new file mode 100644 index 00000000..55d9b59f --- /dev/null +++ b/test/block_to_ctile_map/test_block_to_ctile_map.cpp @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
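+// These tests exercise the block-id -> C-tile-index maps used by the GEMM grids.
+// For example, with M = N = 384 and 128x128 tiles the C grid holds 3x3 valid tiles;
+// with M01 = N01 = 4 the M00_N00_M01_N01 map rounds the launch grid up to 4x4 = 16
+// blocks, so any block decoded to m0 == 3 or n0 == 3 must be reported as an invalid
+// C-tile index (see the expected tables below).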
+ +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" + +using namespace ck; + +static auto I0 = Number<0>{}; +static auto I1 = Number<1>{}; +static auto I2 = Number<2>{}; + +TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck1) +{ + const index_t M = 384; + const index_t N = 384; + const index_t MPerBlock = 128; + const index_t NPerBlock = 128; + const index_t MBlock = M / MPerBlock; + const index_t NBlock = N / NPerBlock; + const index_t M01 = 4; + const index_t N01 = 4; + + auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N)); + + printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n", + M, + N, + MPerBlock, + NPerBlock, + M01, + N01); + + BlockToCTileMap_M00_N00_M01_N01 tile_map( + c_grid_desc_m_n, M01, N01); + + EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true); + EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16); + + // clang-format off + std::vector> expected_m0idx_n0idx_valid = { + {0, 0, 1}, + {0, 1, 1}, + {0, 2, 1}, + {0, 3, 0}, + {1, 0, 1}, + {1, 1, 1}, + {1, 2, 1}, + {1, 3, 0}, + {2, 0, 1}, + {2, 1, 1}, + {2, 2, 1}, + {2, 3, 0}, + {3, 0, 0}, + {3, 1, 0}, + {3, 2, 0}, + {3, 3, 0} + }; + // clang-format on + + for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++) + { + auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i)); + std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1]; + std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock)) + << std::endl; + bool equal = + expected_m0idx_n0idx_valid[i] == + std::vector{m0n0_idx[I0], + m0n0_idx[I1], + tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))}; + EXPECT_TRUE(equal); + } +} + +TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N00_M01_N01_DeviceCTileIndexCheck0) +{ + const index_t M = 384; + const index_t N = 384; + const index_t MPerBlock = 128; + const index_t NPerBlock = 128; + + const index_t M01 = 4; + const index_t N01 = 4; + + auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N)); + + printf("(M, N, MPerBlock, NPerBlock, M01, N01) = (%d, %d, %d, %d, %d, %d)\n", + M, + N, + MPerBlock, + NPerBlock, + M01, + N01); + + BlockToCTileMap_M00_N00_M01_N01 + tile_map(c_grid_desc_m_n, M01, N01); + + EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == false); +} + +TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck1) +{ + const index_t M = 384; + const index_t N = 512; + const index_t MPerBlock = 128; + const index_t NPerBlock = 128; + const index_t MBlock = M / MPerBlock; + const index_t NBlock = N / NPerBlock; + const index_t M01 = 4; + + auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N)); + + printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n", + M, + N, + MPerBlock, + NPerBlock, + M01); + + BlockToCTileMap_M00_N0_M01 tile_map( + c_grid_desc_m_n, M01); + + EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true); + EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 16); + + // clang-format off + std::vector> expected_m0idx_n0idx_valid = { + {0, 0, 1}, + {1, 0, 1}, + {2, 0, 1}, + {3, 0, 0}, + {0, 1, 1}, + {1, 1, 1}, + {2, 1, 1}, + {3, 1, 0}, + {0, 2, 1}, + {1, 2, 1}, + {2, 2, 1}, + {3, 2, 0}, + {0, 3, 1}, + {1, 3, 1}, + {2, 3, 1}, + {3, 3, 0} + }; + // clang-format on + + for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++) + { + auto 
m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i)); + std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1]; + std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock)) + << std::endl; + bool equal = + expected_m0idx_n0idx_valid[i] == + std::vector{m0n0_idx[I0], + m0n0_idx[I1], + tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))}; + EXPECT_TRUE(equal); + } +} + +TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01_DeviceCTileIndexCheck0) +{ + const index_t M = 512; + const index_t N = 384; + const index_t MPerBlock = 128; + const index_t NPerBlock = 128; + + auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N)); + + // clang-format off + std::vector> expected_m0_gridsize_validity = { + {5, 15, false}, + {4, 12, true}, + {3, 18, false}, + {2, 12, true}, + {1, 12, true} + }; + // clang-format on + + for(auto e : expected_m0_gridsize_validity) + { + const index_t M01 = std::get<0>(e); + + printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n", + M, + N, + MPerBlock, + NPerBlock, + M01); + + BlockToCTileMap_M00_N0_M01 tile_map( + c_grid_desc_m_n, M01); + + EXPECT_EQ(tile_map.CalculateGridSize(c_grid_desc_m_n), std::get<1>(e)); + EXPECT_EQ(tile_map.CheckValidity(c_grid_desc_m_n), std::get<2>(e)); + } +} + +TEST(BlockToCTileMap, TestBlockToCTileMap_M00_N0_M01Adapt) +{ + const index_t M = 768; + const index_t N = 384; + const index_t MPerBlock = 128; + const index_t NPerBlock = 128; + const index_t MBlock = M / MPerBlock; + const index_t NBlock = N / NPerBlock; + constexpr index_t M01 = 4; + + auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N)); + + printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n", + M, + N, + MPerBlock, + NPerBlock, + M01); + + BlockToCTileMap_M00_N0_M01Adapt tile_map( + c_grid_desc_m_n, M01); + + EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true); + EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18); + + // clang-format off + std::vector> expected_m0idx_n0idx_valid = { + {0, 0, 1}, + {1, 0, 1}, + {2, 0, 1}, + {3, 0, 1}, + {0, 1, 1}, + {1, 1, 1}, + {2, 1, 1}, + {3, 1, 1}, + {0, 2, 1}, + {1, 2, 1}, + {2, 2, 1}, + {3, 2, 1}, + {4, 0, 1}, + {5, 0, 1}, + {4, 1, 1}, + {5, 1, 1}, + {4, 2, 1}, + {5, 2, 1}, + }; + // clang-format on + + for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++) + { + auto m0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i)); + std::cout << "block_1d_id = " << i << ", m0, n0 = " << m0n0_idx[I0] << ", " << m0n0_idx[I1]; + std::cout << ", valid = " << tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock)) + << std::endl; + bool equal = + expected_m0idx_n0idx_valid[i] == + std::vector{m0n0_idx[I0], + m0n0_idx[I1], + tile_map.ValidCTileIndex(m0n0_idx, make_tuple(MBlock, NBlock))}; + EXPECT_TRUE(equal); + } +} + +TEST(BlockToCTileMap, TestBlockToCTileMap_KSplit_M00_N0_M01Adapt) +{ + const index_t M = 768; + const index_t N = 384; + const index_t MPerBlock = 128; + const index_t NPerBlock = 128; + const index_t MBlock = M / MPerBlock; + const index_t NBlock = N / NPerBlock; + constexpr index_t M01 = 4; + const index_t KSplit = 3; + + auto c_grid_desc_m_n = make_naive_tensor_descriptor_packed(make_tuple(M, N)); + + printf("(M, N, MPerBlock, NPerBlock, M01) = (%d, %d, %d, %d, %d)\n", + M, + N, + MPerBlock, + NPerBlock, + M01); + + BlockToCTileMap_KSplit_M00_N0_M01Adapt + tile_map(c_grid_desc_m_n, M01, KSplit); + + 
EXPECT_TRUE(tile_map.CheckValidity(c_grid_desc_m_n) == true); + EXPECT_TRUE(tile_map.CalculateGridSize(c_grid_desc_m_n) == 18 * KSplit); + + std::vector> expected_ksplitidx_m0idx_n0idx_valid = { + {0, 0, 0, 1}, {0, 1, 0, 1}, {0, 2, 0, 1}, {0, 3, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1}, + {0, 2, 1, 1}, {0, 3, 1, 1}, {0, 0, 2, 1}, {0, 1, 2, 1}, {0, 2, 2, 1}, {0, 3, 2, 1}, + {0, 4, 0, 1}, {0, 5, 0, 1}, {0, 4, 1, 1}, {0, 5, 1, 1}, {0, 4, 2, 1}, {0, 5, 2, 1}, + {1, 0, 0, 1}, {1, 1, 0, 1}, {1, 2, 0, 1}, {1, 3, 0, 1}, {1, 0, 1, 1}, {1, 1, 1, 1}, + {1, 2, 1, 1}, {1, 3, 1, 1}, {1, 0, 2, 1}, {1, 1, 2, 1}, {1, 2, 2, 1}, {1, 3, 2, 1}, + {1, 4, 0, 1}, {1, 5, 0, 1}, {1, 4, 1, 1}, {1, 5, 1, 1}, {1, 4, 2, 1}, {1, 5, 2, 1}, + {2, 0, 0, 1}, {2, 1, 0, 1}, {2, 2, 0, 1}, {2, 3, 0, 1}, {2, 0, 1, 1}, {2, 1, 1, 1}, + {2, 2, 1, 1}, {2, 3, 1, 1}, {2, 0, 2, 1}, {2, 1, 2, 1}, {2, 2, 2, 1}, {2, 3, 2, 1}, + {2, 4, 0, 1}, {2, 5, 0, 1}, {2, 4, 1, 1}, {2, 5, 1, 1}, {2, 4, 2, 1}, {2, 5, 2, 1}, + }; + + for(index_t i = 0; i < tile_map.CalculateGridSize(c_grid_desc_m_n); i++) + { + auto ksplitm0n0_idx = tile_map.CalculateBottomIndex(make_multi_index(i)); + std::cout << "block_1d_id = " << i << ", ksplit, m0, n0 = " << ksplitm0n0_idx[I0] << ", " + << ksplitm0n0_idx[I1] << ", " << ksplitm0n0_idx[I2]; + std::cout << ", valid = " + << tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock)) + << std::endl; + bool equal = + expected_ksplitidx_m0idx_n0idx_valid[i] == + std::vector{ksplitm0n0_idx[I0], + ksplitm0n0_idx[I1], + ksplitm0n0_idx[I2], + tile_map.ValidCTileIndex(ksplitm0n0_idx, make_tuple(MBlock, NBlock))}; + EXPECT_TRUE(equal); + } +} diff --git a/test/conv_util/CMakeLists.txt b/test/conv_util/CMakeLists.txt new file mode 100644 index 00000000..7a46039f --- /dev/null +++ b/test/conv_util/CMakeLists.txt @@ -0,0 +1,2 @@ +add_gtest_executable(test_conv_util conv_util.cpp) +target_link_libraries(test_conv_util PRIVATE utility) diff --git a/test/conv_util/conv_util.cpp b/test/conv_util/conv_util.cpp new file mode 100644 index 00000000..73797a71 --- /dev/null +++ b/test/conv_util/conv_util.cpp @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
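+// Illustrative sketch only (not part of the original tests): the expected output
+// spatial lengths below follow the usual convolution output-size formula, assuming
+// symmetric padding. The helper and static_asserts are provided for reference.
+namespace {
+constexpr int ExpectedConvOutLen(int in, int filter, int stride, int dilation, int pad)
+{
+    // effective filter span is dilation * (filter - 1) + 1
+    return (in + 2 * pad - dilation * (filter - 1) - 1) / stride + 1;
+}
+static_assert(ExpectedConvOutLen(71, 3, 2, 1, 1) == 36, "stride 2, dilation 1, pad 1");
+static_assert(ExpectedConvOutLen(71, 3, 3, 2, 1) == 23, "stride 3, dilation 2, pad 1");
+} // namespace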
+ +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/convolution_parameter.hpp" + +namespace { + +class TestConvUtil : public ::testing::Test +{ + public: + void SetNDParams(std::size_t ndims, std::size_t s, std::size_t d, std::size_t p) + { + conv_params = ck::utils::conv::ConvParam(ndims, + 2, + 128, + 192, + 256, + std::vector(ndims, 3), + std::vector(ndims, 71), + std::vector(ndims, s), + std::vector(ndims, d), + std::vector(ndims, p), + std::vector(ndims, p)); + } + + protected: + // ------- default 2D ------- + // input GNCHW {2, 128, 192, 71, 71}, + // weights GKCYX {2, 256, 192, 3, 3}, + // stride {s, s}, + // dilations {d, d}, + // padding {{p, p}, {p, p} + ck::utils::conv::ConvParam conv_params; +}; + +} // namespace + +TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths1D) +{ + // stride 2, dilation 1, pad 1 + SetNDParams(1, 2, 1, 1); + std::vector out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, std::vector{36}, "Error: ConvParams 1D.")); + + // stride 1, dilation 1, pad 1 + SetNDParams(1, 1, 1, 1); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, std::vector{71}, "Error: ConvParams 1D stride {1}.")); + + // stride 2, dilation 1, pad 2 + SetNDParams(1, 2, 1, 2); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err(out_spatial_len, + std::vector{37}, + "Error: ConvParams 1D padding left/right {2}.")); + + // stride 2, dilation 2, pad 2 + SetNDParams(1, 2, 2, 2); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, std::vector{36}, "Error: ConvParams 1D dilation {2}.")); + + // stride 3, dilation 2, pad 1 + SetNDParams(1, 3, 2, 1); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE( + ck::utils::check_err(out_spatial_len, + std::vector{23}, + "Error: ConvParams 1D strides{3}, padding {1}, dilations {2}.")); +} + +TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths2D) +{ + // stride 2, dilation 1, pad 1 + SetNDParams(2, 2, 1, 1); + std::vector out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err(out_spatial_len, + std::vector{36, 36}, + "Error: ConvParams 2D default constructor.")); + + // stride 1, dilation 1, pad 1 + SetNDParams(2, 1, 1, 1); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, std::vector{71, 71}, "Error: ConvParams 2D stride {1,1}.")); + + // stride 2, dilation 1, pad 2 + SetNDParams(2, 2, 1, 2); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err(out_spatial_len, + std::vector{37, 37}, + "Error: ConvParams 2D padding left/right {2,2}.")); + + // stride 2, dilation 2, pad 2 + SetNDParams(2, 2, 2, 2); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, std::vector{36, 36}, "Error: ConvParams 2D dilation {2,2}.")); + + // stride 3, dilation 2, pad 1 + SetNDParams(2, 3, 2, 1); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE( + ck::utils::check_err(out_spatial_len, + std::vector{23, 23}, + "Error: ConvParams 2D strides{3,3}, padding {1,1}, dilations {2,2}.")); +} + +TEST_F(TestConvUtil, ConvParamsGetOutputSpatialLengths3D) +{ + // stride 
2, dilation 1, pad 1 + SetNDParams(3, 2, 1, 1); + std::vector out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, std::vector{36, 36, 36}, "Error: ConvParams 3D.")); + + // stride 1, dilation 1, pad 1 + SetNDParams(3, 1, 1, 1); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err(out_spatial_len, + std::vector{71, 71, 71}, + "Error: ConvParams 3D stride {1, 1, 1}.")); + + // stride 2, dilation 1, pad 2 + SetNDParams(3, 2, 1, 2); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err(out_spatial_len, + std::vector{37, 37, 37}, + "Error: ConvParams 3D padding left/right {2, 2, 2}.")); + + // stride 2, dilation 2, pad 2 + SetNDParams(3, 2, 2, 2); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err(out_spatial_len, + std::vector{36, 36, 36}, + "Error: ConvParams 3D dilation {2, 2, 2}.")); + + // stride 3, dilation 2, pad 1 + SetNDParams(3, 3, 2, 1); + out_spatial_len = conv_params.GetOutputSpatialLengths(); + EXPECT_TRUE(ck::utils::check_err( + out_spatial_len, + std::vector{23, 23, 23}, + "Error: ConvParams 3D strides{3, 3, 3}, padding {1, 1, 1}, dilations {2, 2, 2}.")); +} diff --git a/test/convnd_bwd_data/CMakeLists.txt b/test/convnd_bwd_data/CMakeLists.txt new file mode 100644 index 00000000..16ca4de8 --- /dev/null +++ b/test/convnd_bwd_data/CMakeLists.txt @@ -0,0 +1,2 @@ +add_gtest_executable(test_convnd_bwd_data convnd_bwd_data.cpp) +target_link_libraries(test_convnd_bwd_data PRIVATE utility device_conv1d_bwd_data_instance device_conv2d_bwd_data_instance device_conv3d_bwd_data_instance) diff --git a/test/convnd_bwd_data/convnd_bwd_data.cpp b/test/convnd_bwd_data/convnd_bwd_data.cpp new file mode 100644 index 00000000..70231d42 --- /dev/null +++ b/test/convnd_bwd_data/convnd_bwd_data.cpp @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
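+// Note: each conv_params entry below is a ck::utils::conv::ConvParam initializer;
+// following the constructor call in test/conv_util/conv_util.cpp, the fields are
+// presumably {ndim, groups, N, K, C, filter_spatial..., input_spatial..., strides...,
+// dilations..., left_pads..., right_pads...}.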
+ +#include +#include +#include +#include +#include +#include + +#include "profiler/profile_conv_bwd_data_impl.hpp" + +template +class TestConvndBwdData : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + std::vector conv_params; + + template + void Run() + { + for(auto& param : conv_params) + { + bool pass; + EXPECT_FALSE(conv_params.empty()); + pass = ck::profiler::profile_conv_bwd_data_impl< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + DataType, + DataType, + DataType>(true, // do_verification + 1, // init_method integer value + false, // do_log + false, // time_kernel + param); + EXPECT_TRUE(pass); + } + } +}; + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; +TYPED_TEST_SUITE(TestConvndBwdData, KernelTypes); + +// 1d +TYPED_TEST(TestConvndBwdData, Conv1dBwdData) +{ + this->conv_params.clear(); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); + this->template Run<1>(); +} + +// 2d +TYPED_TEST(TestConvndBwdData, Conv2dBwdData) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->template Run<2>(); +} + +// 3d +TYPED_TEST(TestConvndBwdData, Conv3dBwdData) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 1, 128, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->template Run<3>(); +} diff --git a/test/convnd_fwd/CMakeLists.txt b/test/convnd_fwd/CMakeLists.txt new file mode 100644 index 00000000..97e170d8 --- /dev/null +++ b/test/convnd_fwd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_gtest_executable(test_convnd_fwd convnd_fwd.cpp) +target_link_libraries(test_convnd_fwd PRIVATE utility device_conv2d_fwd_instance) diff --git a/test/convnd_fwd/convnd_fwd.cpp b/test/convnd_fwd/convnd_fwd.cpp new file mode 100644 index 00000000..a1921a9b --- /dev/null +++ b/test/convnd_fwd/convnd_fwd.cpp @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "profiler/profile_conv_fwd_impl.hpp" + +template +class TestConvndFwd : public ::testing::Test +{ + protected: + using DataType = std::tuple_element_t<0, Tuple>; + std::vector conv_params; + + template + void Run() + { + for(auto& param : conv_params) + { + bool pass; + EXPECT_FALSE(conv_params.empty()); + pass = ck::profiler::profile_conv_fwd_impl< + NDimSpatial, + ck::tuple_element_t>, + ck::tuple_element_t>, + ck::tuple_element_t>, + DataType, + DataType, + DataType>(true, // do_verification + 1, // init_method integer value + false, // do_log + false, // time_kernel + param); + EXPECT_TRUE(pass); + } + } +}; + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; +TYPED_TEST_SUITE(TestConvndFwd, KernelTypes); + +// 1d +TYPED_TEST(TestConvndFwd, Conv1dFwd) +{ + this->conv_params.clear(); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {14}, {2}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 128, 128, 256, {1}, {3}, {1}, {1}, {0}, {0}}); + this->template Run<1>(); +} + +// 2d +TYPED_TEST(TestConvndFwd, Conv2dFwd) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 1, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->template Run<2>(); +} +// 3d +TYPED_TEST(TestConvndFwd, Conv3dFwd) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 1, 128, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 1, 128, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 128, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->template Run<3>(); +} diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt new file mode 100644 index 00000000..088fbfec --- /dev/null +++ b/test/data_type/CMakeLists.txt @@ -0,0 +1,4 @@ +if (USE_BITINT_EXTENSION_INT4) + add_gtest_executable(test_int4 int4.cpp) + target_link_libraries(test_int4 PRIVATE utility) +endif() diff --git a/test/data_type/int4.cpp b/test/data_type/int4.cpp new file mode 100644 index 00000000..252a450b --- /dev/null +++ b/test/data_type/int4.cpp @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
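+// Note: ck::int4_t is only built when the USE_BITINT_EXTENSION_INT4 option is enabled
+// (see test/data_type/CMakeLists.txt). The tests below check that it behaves like a
+// signed 4-bit integer with range [-8, 7] and that values survive a round trip through
+// device memory when converted to int8 via static_cast.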
+ +#include +#include +#include +#include +#include "gtest/gtest.h" +#include + +#include "ck/host_utility/hip_check_error.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/math_v2.hpp" +#include "ck/utility/get_id.hpp" +#include "ck/library/utility/device_memory.hpp" + +using ck::int4_t; + +TEST(Int4, BaseArithmetic) +{ + int4_t a{1}; + int4_t b{-2}; + EXPECT_EQ(a + a, int4_t{2}); + EXPECT_EQ(a - a, int4_t{0}); + EXPECT_EQ(a + b, int4_t{-1}); + EXPECT_EQ(a - b, int4_t{3}); + EXPECT_EQ(a * a, int4_t{1}); + EXPECT_EQ(a * b, int4_t{-2}); + EXPECT_EQ(b * b, int4_t{4}); + EXPECT_EQ(a / b, int4_t{0}); + a = int4_t{4}; + EXPECT_EQ(a / b, int4_t{-2}); + b = int4_t{2}; + EXPECT_EQ(a % b, int4_t{0}); +} + +TEST(Int4, NumericLimits) +{ + EXPECT_EQ(ck::NumericLimits::Min(), int4_t{-8}); + EXPECT_EQ(ck::NumericLimits::Max(), int4_t{7}); + EXPECT_EQ(ck::NumericLimits::Lowest(), int4_t{-8}); +} + +TEST(Int4, MathOpsV2) +{ + int4_t a{4}; + int4_t b{-5}; + + EXPECT_EQ(ck::math::abs(a), int4_t{4}); + EXPECT_EQ(ck::math::abs(b), int4_t{5}); + EXPECT_FALSE(ck::math::isnan(b)); +} + +namespace { + +__global__ void copy(const int4_t* src, std::int8_t* dst, ck::index_t N) +{ + ck::index_t tid = ck::get_thread_global_1d_id(); + + const int8_t* src_i8 = reinterpret_cast(src); + + if(tid < N) + { + for(ck::index_t i = tid; i < N; i += ck::get_grid_size()) + { + dst[i] = src_i8[i]; + } + } +} + +__global__ void copy_with_static_cast(const int4_t* src, std::int8_t* dst, ck::index_t N) +{ + ck::index_t tid = ck::get_thread_global_1d_id(); + + if(tid < N) + { + for(ck::index_t i = tid; i < N; i += ck::get_grid_size()) + { + dst[i] = static_cast(src[i]); + } + } +} + +} // anonymous namespace + +TEST(Int4, CopyAsI8PositiveValue) +{ + constexpr std::size_t SIZE = 100; + std::vector h_src_i4(SIZE, 7); + std::vector h_src_i8(SIZE, 7); + std::vector h_dst_i8(SIZE, 0); + + DeviceMem d_src_i4(h_src_i4.size() * sizeof(int4_t)); + DeviceMem d_dst_i8(h_dst_i8.size() * sizeof(std::int8_t)); + + d_src_i4.SetZero(); + d_dst_i8.SetZero(); + + d_src_i4.ToDevice(h_src_i4.data()); + + copy<<<1, 64>>>(reinterpret_cast(d_src_i4.GetDeviceBuffer()), + reinterpret_cast(d_dst_i8.GetDeviceBuffer()), + SIZE); + hip_check_error(hipDeviceSynchronize()); + d_dst_i8.FromDevice(h_dst_i8.data()); + + for(std::size_t i = 0; i < SIZE; ++i) + { + EXPECT_EQ(h_src_i8[i], h_dst_i8[i]); + } +} + +TEST(Int4, DISABLED_CopyAsI8NegativeValue) +{ + constexpr std::size_t SIZE = 32; + std::vector h_src_i4(SIZE, -8); + std::vector h_src_i8(SIZE, -8); + std::vector h_dst_i8(SIZE, 0); + + DeviceMem d_src_i4(h_src_i4.size() * sizeof(int4_t)); + DeviceMem d_dst_i8(h_dst_i8.size() * sizeof(std::int8_t)); + + d_src_i4.SetZero(); + d_dst_i8.SetZero(); + + d_src_i4.ToDevice(h_src_i4.data()); + + copy<<<1, 64>>>(reinterpret_cast(d_src_i4.GetDeviceBuffer()), + reinterpret_cast(d_dst_i8.GetDeviceBuffer()), + SIZE); + hip_check_error(hipDeviceSynchronize()); + d_dst_i8.FromDevice(h_dst_i8.data()); + + for(std::size_t i = 0; i < SIZE; ++i) + { + EXPECT_EQ(h_src_i8[i], h_dst_i8[i]); + } +} + +TEST(Int4, CopyAsI8NegativeValueStaticCast) +{ + constexpr std::size_t SIZE = 32; + std::vector h_src_i4(SIZE, -8); + std::vector h_src_i8(SIZE, -8); + std::vector h_dst_i8(SIZE, 0); + + DeviceMem d_src_i4(h_src_i4.size() * sizeof(int4_t)); + DeviceMem d_dst_i8(h_dst_i8.size() * sizeof(std::int8_t)); + + d_src_i4.SetZero(); + d_dst_i8.SetZero(); + + d_src_i4.ToDevice(h_src_i4.data()); + + copy_with_static_cast<<<1, 64>>>(reinterpret_cast(d_src_i4.GetDeviceBuffer()), + 
reinterpret_cast(d_dst_i8.GetDeviceBuffer()), + SIZE); + hip_check_error(hipDeviceSynchronize()); + d_dst_i8.FromDevice(h_dst_i8.data()); + + for(std::size_t i = 0; i < SIZE; ++i) + { + EXPECT_EQ(h_src_i8[i], h_dst_i8[i]); + } +} + +TEST(Int4, DISABLED_BitwiseRepresentation) +{ + using bit8_t = std::bitset<8>; + + int4_t a_i4{3}; + std::int8_t a_i8 = *reinterpret_cast(&a_i4); + std::int8_t b_i8{3}; +#if 0 + std::cout << std::hex << std::showbase << static_cast(a_i8) + << ", " << static_cast(b_i8) << std::endl; +#endif + EXPECT_EQ(bit8_t{static_cast(a_i8)}, bit8_t{static_cast(b_i8)}); + + a_i4 = int4_t{-3}; + a_i8 = *reinterpret_cast(&a_i4); + b_i8 = std::int8_t{-3}; +#if 0 + std::cout << std::hex << std::showbase << static_cast(a_i8) + << ", " << static_cast(b_i8) << std::endl; +#endif + EXPECT_EQ(bit8_t{static_cast(a_i8)}, bit8_t{static_cast(b_i8)}); +} + +TEST(Int4, BitwiseRepresentationStaticCast) +{ + using bit8_t = std::bitset<8>; + + int4_t a_i4{3}; + std::int8_t a_i8 = static_cast(a_i4); + std::int8_t b_i8{3}; +#if 0 + std::cout << std::hex << std::showbase << static_cast(a_i8) + << ", " << static_cast(b_i8) << std::endl; +#endif + EXPECT_EQ(bit8_t{static_cast(a_i8)}, bit8_t{static_cast(b_i8)}); + + a_i4 = int4_t{-3}; + a_i8 = static_cast(a_i4); + b_i8 = std::int8_t{-3}; +#if 0 + std::cout << std::hex << std::showbase << static_cast(a_i8) + << ", " << static_cast(b_i8) << std::endl; +#endif + EXPECT_EQ(bit8_t{static_cast(a_i8)}, bit8_t{static_cast(b_i8)}); +} diff --git a/test/elementwise_normalization/CMakeLists.txt b/test/elementwise_normalization/CMakeLists.txt new file mode 100644 index 00000000..a20eb263 --- /dev/null +++ b/test/elementwise_normalization/CMakeLists.txt @@ -0,0 +1,7 @@ +add_custom_target(test_elementwise_normalization) + +add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) + +target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) + +add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) diff --git a/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp new file mode 100644 index 00000000..403881b3 --- /dev/null +++ b/test/elementwise_normalization/test_elementwise_layernorm_fp16.cpp @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "gtest/gtest.h" +#include "profiler/profile_elementwise_layernorm_impl.hpp" + +using F16 = ck::half_t; +using F32 = float; +using ck::index_t; + +template +class TestElementwiseLayernorm : public ::testing::Test +{ + protected: + using ADataType = std::tuple_element_t<0, Tuple>; + using BDataType = std::tuple_element_t<1, Tuple>; + using GammaDataType = std::tuple_element_t<2, Tuple>; + using BetaDataType = std::tuple_element_t<3, Tuple>; + using AccDataType = std::tuple_element_t<4, Tuple>; + using YDataType = std::tuple_element_t<5, Tuple>; + + void Run() + { + // M, N + std::vector> lengths = { + {1, 1}, {25, 16}, {39, 777}, {100, 200}, {1024, 1024}, {48 * 256, 2048}}; + + for(auto length : lengths) + { + bool success = ck::profiler::profile_elementwise_layernorm_impl( + true, 2, false, false, length); + EXPECT_TRUE(success); + } + } +}; + +using KernelTypes = ::testing::Types< + // ADataType, BDataType, GammaDataType, BetaDataType, AccDataType, YDataType> + std::tuple>; + +TYPED_TEST_SUITE(TestElementwiseLayernorm, KernelTypes); +TYPED_TEST(TestElementwiseLayernorm, Test_FP16) { this->Run(); } diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt new file mode 100644 index 00000000..c427586b --- /dev/null +++ b/test/gemm/CMakeLists.txt @@ -0,0 +1,25 @@ +add_test_executable(test_gemm_fp32 gemm_fp32.cpp) +target_link_libraries(test_gemm_fp32 PRIVATE utility) +target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) + +add_test_executable(test_gemm_fp16 gemm_fp16.cpp) +target_link_libraries(test_gemm_fp16 PRIVATE utility) +target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance) + +add_test_executable(test_gemm_bf16 gemm_bf16.cpp) +target_link_libraries(test_gemm_bf16 PRIVATE utility) +target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) + +add_test_executable(test_gemm_int8 gemm_int8.cpp) +target_link_libraries(test_gemm_int8 PRIVATE utility) +target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) + +add_library(gemm_standalone_xdl_fp16_instances STATIC + instance/gemm_f16_nn_instance.cpp + instance/gemm_f16_nt_instance.cpp + instance/gemm_f16_tn_instance.cpp + instance/gemm_f16_tt_instance.cpp +) +add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp) +target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility) +target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/) diff --git a/test/gemm/gemm_bf16.cpp b/test/gemm/gemm_bf16.cpp new file mode 100644 index 00000000..5290d466 --- /dev/null +++ b/test/gemm/gemm_bf16.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +using ADataType = ck::bhalf_t; +using BDataType = ck::bhalf_t; +using CDataType = ck::bhalf_t; +using AccDataType = float; + +#include "run_gemm_test.inc" + +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_fp16.cpp b/test/gemm/gemm_fp16.cpp new file mode 100644 index 00000000..92e225de --- /dev/null +++ b/test/gemm/gemm_fp16.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +using ADataType = ck::half_t; +using BDataType = ck::half_t; +using CDataType = ck::half_t; +using AccDataType = float; + +#include "run_gemm_test.inc" + +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_fp32.cpp b/test/gemm/gemm_fp32.cpp new file mode 100644 index 00000000..5d8c4881 --- /dev/null +++ b/test/gemm/gemm_fp32.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +using ADataType = float; +using BDataType = float; +using CDataType = float; +using AccDataType = float; + +#include "run_gemm_test.inc" + +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_fp64.cpp b/test/gemm/gemm_fp64.cpp new file mode 100644 index 00000000..85d7f95b --- /dev/null +++ b/test/gemm/gemm_fp64.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +using ADataType = double; +using BDataType = double; +using CDataType = double; +using AccDataType = double; + +#include "run_gemm_test.inc" + +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_int8.cpp b/test/gemm/gemm_int8.cpp new file mode 100644 index 00000000..e73b22ce --- /dev/null +++ b/test/gemm/gemm_int8.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/gemm.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" + +#include "test/gemm/gemm_util.hpp" + +using ADataType = int8_t; +using BDataType = int8_t; +using CDataType = int8_t; +using AccDataType = int32_t; + +#include "run_gemm_test.inc" + +int main() { return run_gemm_test(); } diff --git a/test/gemm/gemm_standalone_xdl_fp16.cpp b/test/gemm/gemm_standalone_xdl_fp16.cpp new file mode 100644 index 00000000..8f5a5c55 --- /dev/null +++ b/test/gemm/gemm_standalone_xdl_fp16.cpp @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "gemm_util.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp" + +#include "gemm_f16_nn_instance.hpp" +#include "gemm_f16_nt_instance.hpp" +#include "gemm_f16_tn_instance.hpp" +#include "gemm_f16_tt_instance.hpp" + +using Row = ck::tensor_layout::gemm::RowMajor; +using Col = ck::tensor_layout::gemm::ColumnMajor; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using F16 = ck::half_t; +using ADataType = F16; +using BDataType = F16; +using AccDataType = float; +using CDataType = F16; + +using ALayout = Row; +using BLayout = Col; +using CLayout = Row; + +using AElementOp = PassThrough; +using BElementOp = PassThrough; +using CElementOp = PassThrough; + +using ck::gemm_util::GemmParams; +using ck::tensor_operation::device::BaseOperator; +using ck::tensor_operation::device::DeviceGemm; +using namespace ck::tensor_operation::device::instance; + +using DeviceGemmNN = + DeviceGemm; +using DeviceGemmNT = + DeviceGemm; +using DeviceGemmTN = + DeviceGemm; +using DeviceGemmTT = + DeviceGemm; + +struct LayoutConfig +{ + bool ARowMajor; + bool BRowMajor; + bool CRowMajor; +}; + +int main(int argc, char* argv[]) +{ + // Class DeviceGemm is templated by layout and precision types so it is not an option to contain + // them in a single vector. Instead we use abstract BaseOperator class and dynamic_cast() it + // upon invocation. + // And since DeviceGemm does not expose template arg information, an extra book keeping class + // LayoutConfig is used for determining which type a BaseOperator instance should be cast to. 
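+    // For example, for the NN layout the dispatch below effectively does (sketch only;
+    // ops is the container of BaseOperator instances filled in by the factory):
+    //   factory(ops);
+    //   auto* gemm = dynamic_cast<DeviceGemmNN*>(ops[0].get());
+    //   ck::gemm_util::TestGemm<...>{}(gemm, problem_size, do_verification, time_kernel);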
+ using OpFactoryFn = void (*)(std::vector>&); + + std::vector> problems = { + // clang-format off + // 104 tiles + {GemmParams{2048, 3328, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x64}, + {GemmParams{2048, 3328, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x64}, + {GemmParams{2048, 3328, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x64}, + {GemmParams{2048, 3328, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x256}, + {GemmParams{2048, 1664, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x128}, + {GemmParams{1024, 1664, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x128}, + {GemmParams{1024, 832, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x64}, + // 110 tiles + {GemmParams{2560, 2816, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{false, false, true}, add_gemm_f16_nn_128x64}, + {GemmParams{2560, 2816, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{false, true, true}, add_gemm_f16_nt_128x64}, + {GemmParams{2560, 2816, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{true, false, true}, add_gemm_f16_tn_128x64}, + {GemmParams{2560, 2816, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x256}, + {GemmParams{2560, 1408, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_256x128}, + {GemmParams{1280, 1408, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x128}, + {GemmParams{1280, 704, 4096}, LayoutConfig{true, true, true}, add_gemm_f16_tt_128x64}, + // clang-format on + }; + + bool do_verification = true; + bool time_kernel = true; + + if(argc == 1) + { + // use default + } + else if(argc == 3) + { + do_verification = std::stoi(argv[1]); + time_kernel = std::stoi(argv[2]); + } + else + { + std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + << "arg2: time kernel (0=no, 1=yes)" << std::endl; + return 0; + } + + bool pass = true; + for(auto& p : problems) + { + GemmParams& problem_size = std::get<0>(p); + const LayoutConfig& 
+    bool do_verification = true;
+    bool time_kernel     = true;
+
+    if(argc == 1)
+    {
+        // use default
+    }
+    else if(argc == 3)
+    {
+        do_verification = std::stoi(argv[1]);
+        time_kernel     = std::stoi(argv[2]);
+    }
+    else
+    {
+        std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl
+                  << "arg2: time kernel (0=no, 1=yes)" << std::endl;
+        return 0;
+    }
+
+    bool pass = true;
+    for(auto& p : problems)
+    {
+        GemmParams& problem_size          = std::get<0>(p);
+        const LayoutConfig& layout_config = std::get<1>(p);
+        const auto& factory               = std::get<2>(p);
+        std::vector<std::unique_ptr<BaseOperator>> ops;
+        factory(ops);
+
+        // overwrite strides
+        problem_size.StrideA = layout_config.ARowMajor ? problem_size.K : problem_size.M;
+        problem_size.StrideB = layout_config.BRowMajor ? problem_size.N : problem_size.K;
+        problem_size.StrideC = layout_config.CRowMajor ? problem_size.N : problem_size.M;
+
+        if(!layout_config.ARowMajor && !layout_config.BRowMajor)
+        {
+            auto op_ptr = dynamic_cast<DeviceGemmNN*>(ops[0].get());
+            pass &= ck::gemm_util::TestGemm{}(
+                op_ptr, problem_size, do_verification, time_kernel);
+        }
+        else if(!layout_config.ARowMajor && layout_config.BRowMajor)
+        {
+            auto op_ptr = dynamic_cast<DeviceGemmNT*>(ops[0].get());
+            pass &= ck::gemm_util::TestGemm{}(
+                op_ptr, problem_size, do_verification, time_kernel);
+        }
+        else if(layout_config.ARowMajor && !layout_config.BRowMajor)
+        {
+            auto op_ptr = dynamic_cast<DeviceGemmTN*>(ops[0].get());
+            pass &= ck::gemm_util::TestGemm{}(
+                op_ptr, problem_size, do_verification, time_kernel);
+        }
+        else if(layout_config.ARowMajor && layout_config.BRowMajor)
+        {
+            auto op_ptr = dynamic_cast<DeviceGemmTT*>(ops[0].get());
+            pass &= ck::gemm_util::TestGemm{}(
+                op_ptr, problem_size, do_verification, time_kernel);
+        }
+    }
+
+    std::cout << (pass ? "ALL TESTS PASSED" : "SOME TESTS FAILED") << std::endl;
+    return pass ? 0 : 1;
+}
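RunDeviceGEMM in gemm_util.hpp below reports kernel time alongside TFLOPS and GB/s, counting 2*M*N*K floating-point operations and the bytes moved for the three matrices. The standalone sketch below reproduces that arithmetic for the first problem in the table above; the fp16 element size is taken from the test, while the 1.0 ms kernel time is a hypothetical value chosen purely for illustration.

#include <cstdint>
#include <iostream>

int main()
{
    const std::int64_t M = 2048, N = 3328, K = 4096;
    const std::int64_t bytes_per_elem = 2; // fp16 A, B and C

    // Same counting as RunDeviceGEMM: one multiply and one add per (m, n, k) triple,
    // plus the bytes of A (M x K), B (K x N) and C (M x N).
    const std::int64_t flop      = 2 * M * N * K;
    const std::int64_t num_bytes = bytes_per_elem * (M * K + K * N + M * N);

    const float ave_time_ms = 1.0f; // hypothetical measured kernel time

    // flop / 1e9 per millisecond is TFLOP/s; bytes / 1e6 per millisecond is GB/s.
    const float tflops   = static_cast<float>(flop) / 1.e9f / ave_time_ms;
    const float gb_per_s = static_cast<float>(num_bytes) / 1.e6f / ave_time_ms;

    std::cout << tflops << " TFLOPS, " << gb_per_s << " GB/s at " << ave_time_ms << " ms\n";
    return 0;
}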
diff --git a/test/gemm/gemm_util.hpp b/test/gemm/gemm_util.hpp
new file mode 100644
index 00000000..9057c0af
--- /dev/null
+++ b/test/gemm/gemm_util.hpp
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+
+namespace ck {
+namespace gemm_util {
+
+struct GemmParams
+{
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+
+    ck::index_t StrideA = 1024;
+    ck::index_t StrideB = 1024;
+    ck::index_t StrideC = 1024;
+};
+
+template <typename GemmInstance,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+void RunHostGEMM(const Tensor<ADataType>& A,
+                 const Tensor<BDataType>& B,
+                 Tensor<CDataType>& C,
+                 AElementwiseOperation a_element_op,
+                 BElementwiseOperation b_element_op,
+                 CElementwiseOperation c_element_op)
+{
+    auto ref_gemm    = GemmInstance{};
+    auto ref_invoker = ref_gemm.MakeInvoker();
+
+    auto ref_argument = ref_gemm.MakeArgument(A, B, C, a_element_op, b_element_op, c_element_op);
+
+    ref_invoker.Run(ref_argument);
+}
+
+template <typename DeviceGemmPtr_,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation>
+bool RunDeviceGEMM(DeviceGemmPtr_& gemmPtr,
+                   const ck::gemm_util::GemmParams& params,
+                   const Tensor<ADataType>& A,
+                   const Tensor<BDataType>& B,
+                   Tensor<CDataType>& C,
+                   AElementwiseOperation a_element_op,
+                   BElementwiseOperation b_element_op,
+                   CElementwiseOperation c_element_op,
+                   bool time_kernel)
+{
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize());
+    DeviceMem b_k_n_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize());
+
+    auto invoker_ptr  = gemmPtr->MakeInvokerPointer();
+    auto argument_ptr =
+        gemmPtr->MakeArgumentPointer(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                                     static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
+                                     static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+                                     params.M,
+                                     params.N,
+                                     params.K,
+                                     params.StrideA,
+                                     params.StrideB,
+                                     params.StrideC,
+                                     a_element_op,
+                                     b_element_op,
+                                     c_element_op);
+
+    if(gemmPtr->IsSupportedArgument(argument_ptr.get()))
+    {
+        a_m_k_device_buf.ToDevice(A.mData.data());
+        b_k_n_device_buf.ToDevice(B.mData.data());
+        float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
+
+        std::size_t flop      = std::size_t(2) * params.M * params.N * params.K;
+        std::size_t num_btype = sizeof(ADataType) * params.M * params.K +
+                                sizeof(BDataType) * params.K * params.N +
+                                sizeof(CDataType) * params.M * params.N;
+
+        float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+        float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+        std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                  << " GB/s" << std::endl;
+
+        c_m_n_device_buf.FromDevice(C.mData.data());
+
+        return true;
+    }
+    else
+    {
+        std::cout << "device_gemm with the specified compilation parameters does "
+                     "not support this GEMM problem"
+                  << std::endl;
+
+        return false;
+    }
+}
+
+template
+struct TestGemm
+{
+    template
+    auto PrepareGemmTensor(const ck::gemm_util::GemmParams& params)
+    {
+        auto f_host_tensor_descriptor =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                using namespace ck::literals;
+
+                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+                {
+                    return HostTensorDescriptor({row, col}, {stride, 1_uz});
+                }
+                else
+                {
+                    return HostTensorDescriptor({row, col}, {1_uz, stride});
+                }
+            };
+
+        Tensor<ADataType> a_m_k(
+            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+        Tensor<BDataType> b_k_n(
+            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        Tensor<CDataType> c_m_n_host_result(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        Tensor<CDataType> c_m_n_device_result(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+
+        auto f_generate_tensor_value = [](auto& tensor, auto type) {
+            using dataType = decltype(type);
+
+            tensor.GenerateTensorValue(GeneratorTensor_2<dataType>{-5, 5});
+        };
+
+        f_generate_tensor_value(a_m_k, ADataType{});
+        f_generate_tensor_value(b_k_n, BDataType{});
+
+        std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
+        std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
+        std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+
+        return std::make_tuple(a_m_k, b_k_n, c_m_n_host_result, c_m_n_device_result);
+    }
+
+    template